In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import window, hour, col
from pyspark.sql.types import *
import pyspark.sql.utils
import os

In [2]:
# Directory that holds the taxi CSV files; adjust it to match your machine.
FILE_DIR = "file:///home/p4stwi2x/Desktop/abs/taxi-data/"

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("StructuredNetworkWordCount")
    .getOrCreate()
)

# Explicit schema for the incoming CSV files: a record type, a vendor id and
# the pickup / dropoff timestamps. Every field is declared nullable.
csvSchema = (
    StructType()
    .add("type", StringType(), True)
    .add("VendorID", IntegerType(), True)
    .add("tpep_pickup_datetime", TimestampType(), True)
    .add("tpep_dropoff_datetime", TimestampType(), True)
)

# Streaming source: read the CSV files found in FILE_DIR using the schema above.
streamingInputDF = (
    spark.readStream
    .schema(csvSchema)
    # .option("maxFilesPerTrigger", 1)   # disabled
    .csv(FILE_DIR)
)

# Count the records per 1-hour window of the dropoff timestamp.
streamingCountsDF = (
    streamingInputDF
    .groupBy(window(col("tpep_dropoff_datetime"), "1 hour"))
    .count()
)

streamingCountsDF.printSchema()

# Main query: write the complete aggregation result into the in-memory
# table named "counts" (the sink format could also be "console").
query = (
    streamingCountsDF.writeStream
    .format("memory")
    .queryName("counts")
    .outputMode("complete")
    # .option("truncate", "false")       # disabled
    .start()
)

# Block until all input available at this point has been processed.
query.processAllAvailable()

root
 |-- window: struct (nullable = false)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- count: long (nullable = false)



In [3]:
# Stop the streaming query started in the previous cell. The in-memory table
# `counts` that it populated is read back by the following cells.
query.stop()

In [4]:
# Display the hourly counts stored in the in-memory table `counts`
# (up to 24 rows, without truncating the window column).
# `pyspark.sql.utils` is already imported in the notebook's import cell,
# so it is not imported a second time here.
try:
    spark.sql('select * from counts order by window').show(24, truncate=False)
except pyspark.sql.utils.AnalysisException as exc:
    # Keep the original message, but also report why Spark rejected the
    # query (e.g. missing table or column) instead of hiding the cause.
    print("Unable to process your query!!")
    print(f"Reason: {exc}")
else:
    print("Query executed")

+------------------------------------------+-----+
|window                                    |count|
+------------------------------------------+-----+
|{2015-12-01 00:00:00, 2015-12-01 01:00:00}|7396 |
|{2015-12-01 01:00:00, 2015-12-01 02:00:00}|5780 |
|{2015-12-01 02:00:00, 2015-12-01 03:00:00}|3605 |
|{2015-12-01 03:00:00, 2015-12-01 04:00:00}|2426 |
|{2015-12-01 04:00:00, 2015-12-01 05:00:00}|2505 |
|{2015-12-01 05:00:00, 2015-12-01 06:00:00}|3858 |
|{2015-12-01 06:00:00, 2015-12-01 07:00:00}|10258|
|{2015-12-01 07:00:00, 2015-12-01 08:00:00}|19007|
|{2015-12-01 08:00:00, 2015-12-01 09:00:00}|23799|
|{2015-12-01 09:00:00, 2015-12-01 10:00:00}|24003|
|{2015-12-01 10:00:00, 2015-12-01 11:00:00}|21179|
|{2015-12-01 11:00:00, 2015-12-01 12:00:00}|20219|
|{2015-12-01 12:00:00, 2015-12-01 13:00:00}|20522|
|{2015-12-01 13:00:00, 2015-12-01 14:00:00}|20556|
|{2015-12-01 14:00:00, 2015-12-01 15:00:00}|21712|
|{2015-12-01 15:00:00, 2015-12-01 16:00:00}|22016|
|{2015-12-01 16:00:00, 2015-12-

In [5]:
# Export every hourly count into its own CSV folder below output_path_ex02.
output_path_ex02 = 'file:///home/p4stwi2x/Desktop/abs/output_task_2'

# Factor applied to (hour of day + 1) to build the numeric label used in the
# output folder names.
# NOTE(review): one hour is 3_600_000 ms; 360000 is kept unchanged because it
# determines the existing folder names. Confirm against the task description.
HOUR_LABEL_STEP = 360000

hour_count = (
    spark.sql('select * from counts order by window')
    .withColumn("temp", (hour(col('window').start) + 1) * HOUR_LABEL_STEP)
)

# NOTE(review): the label depends only on the hour of day, so windows from
# different days that share an hour would overwrite each other's folder.
# The displayed data appears to cover a single day; confirm for other inputs.
#
# The loop variable must not be called `hour`: that would rebind the imported
# pyspark.sql.functions.hour and make this cell fail when it is run again.
for hour_row in hour_count.collect():
    timestamp_count = hour_row['temp']
    windows_count = hour_row['count']

    # Build the target URI with "/" explicitly; it is a URI, not an OS path.
    output_dir = f"{output_path_ex02}/output-{timestamp_count}"

    # One-row DataFrame holding only the count for this window.
    df_windows = spark.createDataFrame([(windows_count,)], ["count"])

    (
        df_windows.write
        .mode("overwrite")
        .format("csv")
        .option("header", "true")
        .save(output_dir)
    )

    print(f"Timestamp {timestamp_count} has been exported to folder {output_dir}")

Timestamp 360000 has been exported to folder file:///home/p4stwi2x/Desktop/abs/output_task_2/output-360000
Timestamp 720000 has been exported to folder file:///home/p4stwi2x/Desktop/abs/output_task_2/output-720000
Timestamp 1080000 has been exported to folder file:///home/p4stwi2x/Desktop/abs/output_task_2/output-1080000
Timestamp 1440000 has been exported to folder file:///home/p4stwi2x/Desktop/abs/output_task_2/output-1440000
Timestamp 1800000 has been exported to folder file:///home/p4stwi2x/Desktop/abs/output_task_2/output-1800000
Timestamp 2160000 has been exported to folder file:///home/p4stwi2x/Desktop/abs/output_task_2/output-2160000
Timestamp 2520000 has been exported to folder file:///home/p4stwi2x/Desktop/abs/output_task_2/output-2520000
Timestamp 2880000 has been exported to folder file:///home/p4stwi2x/Desktop/abs/output_task_2/output-2880000
Timestamp 3240000 has been exported to folder file:///home/p4stwi2x/Desktop/abs/output_task_2/output-3240000
Timestamp 3600000 has b

In [None]:
# Shut down the SparkSession created earlier in this notebook.
spark.stop()