In [1]:
import findspark
findspark.init()
import pyspark
# Creating a SparkSession in Python
from pyspark.sql import SparkSession
# Build (or reuse, if one already exists) a SparkSession running in local mode.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Spark Streaming Demonstration")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)
# Use only two shuffle partitions so the small aggregations in this notebook
# do not fan out into many tiny tasks.
spark.conf.set("spark.sql.shuffle.partitions", "2")

In [2]:
from pyspark.sql.types import *
# Directory on the local filesystem (relative to the notebook) holding the CSV input files.
inputPath = "../data/"

In [3]:
# One-off batch read of the CSV files under `inputPath`; the schema it yields
# is reused later when the same directory is read as a stream.
staticInputDF = spark.read.csv(inputPath)
schema = staticInputDF.schema

In [4]:
# Display the schema captured from the static read (the cell's last expression is echoed).
schema

StructType([StructField('_c0', StringType(), True), StructField('_c1', StringType(), True), StructField('_c2', StringType(), True), StructField('_c3', StringType(), True), StructField('_c4', StringType(), True), StructField('_c5', StringType(), True), StructField('_c6', StringType(), True), StructField('_c7', StringType(), True), StructField('_c8', StringType(), True), StructField('_c9', StringType(), True), StructField('_c10', StringType(), True), StructField('_c11', StringType(), True), StructField('_c12', StringType(), True), StructField('_c13', StringType(), True), StructField('_c14', StringType(), True), StructField('_c15', StringType(), True), StructField('_c16', StringType(), True), StructField('_c17', StringType(), True), StructField('_c18', StringType(), True), StructField('_c19', StringType(), True)])

In [5]:
import pyspark.sql.functions as f
from pyspark.sql.functions import *

# Streaming counterpart of the static read: same directory and schema, but
# using `readStream` instead of `read`.
streamingInputDF = (
  spark
    .readStream
    .schema(schema)                      # Reuse the schema captured from the static read
    .option("maxFilesPerTrigger", 1440)  # Treat the files as a stream, taking at most 1440 new files per trigger
    .csv(inputPath)
)
# Keep only the two columns the query needs and give them readable names.
# Every CSV column is read as a string, so cast the time column to a timestamp
# explicitly rather than relying on window() to coerce it implicitly.
streamingInputDF = streamingInputDF.select(
    f.col('_c0').alias('Action'),
    f.col('_c3').cast('timestamp').alias('Time'),
)
# Count rows per one-hour window of `Time`. `f.window` is used instead of the
# bare `window` name so the call does not depend on the wildcard import.
streamingCountsDF = (
  streamingInputDF
    .groupBy(f.window(f.col('Time'), "1 hour"))
    .count()
)
# Is this DF actually a streaming DF?
streamingCountsDF.isStreaming

True

In [6]:
# Run the windowed count as a streaming query that keeps its results in an
# in-memory table named "counts", let it ingest for up to 60 seconds, then
# read the table back with SQL and print every row.
query = (
  streamingCountsDF
    .writeStream
    .format("memory")        # memory sink: results are held in an in-memory table
    .queryName("counts")     # table name referenced by the SQL below
    .outputMode("complete")  # emit the full set of aggregated counts on each trigger
    # NOTE(review): "truncate" looks like a console-sink option and probably has
    # no effect on the memory sink - confirm before removing.
    .option("truncate", "false")
    .start()
)
try:
    # Blocks for at most 60 seconds; the table then reflects whatever the
    # stream managed to process in that time.
    query.awaitTermination(60)
finally:
    # Always stop the stream, even if the wait is interrupted or the query
    # fails, so no query named "counts" is left running when this cell is re-run.
    query.stop()

result = spark.sql('select * from counts order by window')

# Show all rows (not just the default first 20) without truncating the window column.
result.show(result.count(), truncate=False)

+------------------------------------------+-----+
|window                                    |count|
+------------------------------------------+-----+
|{2015-12-01 00:00:00, 2015-12-01 01:00:00}|7396 |
|{2015-12-01 01:00:00, 2015-12-01 02:00:00}|5780 |
|{2015-12-01 02:00:00, 2015-12-01 03:00:00}|3605 |
|{2015-12-01 03:00:00, 2015-12-01 04:00:00}|2426 |
|{2015-12-01 04:00:00, 2015-12-01 05:00:00}|2505 |
|{2015-12-01 05:00:00, 2015-12-01 06:00:00}|3858 |
|{2015-12-01 06:00:00, 2015-12-01 07:00:00}|10258|
|{2015-12-01 07:00:00, 2015-12-01 08:00:00}|19007|
|{2015-12-01 08:00:00, 2015-12-01 09:00:00}|23799|
|{2015-12-01 09:00:00, 2015-12-01 10:00:00}|24003|
|{2015-12-01 10:00:00, 2015-12-01 11:00:00}|21179|
|{2015-12-01 11:00:00, 2015-12-01 12:00:00}|20219|
|{2015-12-01 12:00:00, 2015-12-01 13:00:00}|20522|
|{2015-12-01 13:00:00, 2015-12-01 14:00:00}|20556|
|{2015-12-01 14:00:00, 2015-12-01 15:00:00}|21712|
|{2015-12-01 15:00:00, 2015-12-01 16:00:00}|22016|
|{2015-12-01 16:00:00, 2015-12-

In [None]:
import os

# Write each hourly count to its own file, ./output-<n>/output-<n>.txt, where
# <n> starts at `count_increment` and grows by `count_increment` per row.
# NOTE(review): the meaning of 360000 is not evident from this notebook - confirm.
count_increment = 360000
count = count_increment

for hourly_row in result.collect():
    new_path = f'./output-{count}'
    # Create the per-row directory; an existing directory is reused.
    os.makedirs(new_path, exist_ok=True)

    # The file holds only the count value, as text.
    count_text = str(hourly_row['count'])
    with open(f"{new_path}/output-{count}.txt", "w") as out_file:
        out_file.write(count_text)

    count = count + count_increment
