In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from shapely.geometry import Point, Polygon
import pyspark.sql.utils
import os

In [2]:
# change this parameter accordingly to your machine
FILE_DIR = "file:///home/p4stwi2x/Desktop/abs/taxi-data/"

spark = SparkSession.builder.master("local")\
          .appName("RegionEventCount")\
          .config("spark.some.config.option", "some-value")\
          .getOrCreate()

spark.conf.set("spark.sql.shuffle.partitions", "2")

In [3]:
csvSchema = StructType([StructField("type", StringType(), True),
                        StructField("VendorID", IntegerType(), True),
                        StructField("tpep_pickup_datetime", TimestampType(), True),
                        StructField("tpep_dropoff_datetime",TimestampType(), True),

                        StructField("blankCol1", StringType(), True),
                        StructField("blankCol2", StringType(), True),
                        StructField("blankCol3", StringType(), True),
                        StructField("blankCol4", StringType(), True),

                        StructField("long_green",DoubleType(), True),
                        StructField("lat_green", DoubleType(), True),
                        StructField("long_yellow", DoubleType(), True),
                        StructField("lat_yellow", DoubleType(), True)])

In [12]:
streamingInputDF_ex03 = (
  spark
    .readStream
    .schema(csvSchema)
    # .option("maxFilesPerTrigger", 1)
    .csv(FILE_DIR)
)

df_with_lat_long = streamingInputDF_ex03.withColumn(
    "lat",
    when(col("type") == "green", col("lat_green"))
    .when(col("type") == "yellow", col("lat_yellow"))
).withColumn(
    "long",
    when(col("type") == "green", col("long_green"))
    .when(col("type") == "yellow", col("long_yellow"))
)

df_with_lat_long.printSchema()

root
 |-- type: string (nullable = true)
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- blankCol1: string (nullable = true)
 |-- blankCol2: string (nullable = true)
 |-- blankCol3: string (nullable = true)
 |-- blankCol4: string (nullable = true)
 |-- long_green: double (nullable = true)
 |-- lat_green: double (nullable = true)
 |-- long_yellow: double (nullable = true)
 |-- lat_yellow: double (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)



In [13]:
goldman = [[-74.0141012, 40.7152191], [-74.013777, 40.7152275], [-74.0141027, 40.7138745], [-74.0144185, 40.7140753]]
citigroup = [[-74.011869, 40.7217236], [-74.009867, 40.721493], [-74.010140,40.720053], [-74.012083, 40.720267]]
def get_HQ(long, lat):
    pt = Point(long, lat)
    gd_p = Polygon(goldman)
    ct_p = Polygon(citigroup)

    
    if gd_p.contains(pt):
        return "goldman"
    elif ct_p.contains(pt):
        return "citigroup"
    return "unknown"

spark.udf.register("getHQ", get_HQ)

<function __main__.get_HQ(long, lat)>

In [14]:
df_with_lat_long = df_with_lat_long\
    .withColumn("drop_loc", expr("getHQ(long, lat)"))\
    .where(col("drop_loc") != "unknown")

df_with_lat_long.printSchema()

root
 |-- type: string (nullable = true)
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- blankCol1: string (nullable = true)
 |-- blankCol2: string (nullable = true)
 |-- blankCol3: string (nullable = true)
 |-- blankCol4: string (nullable = true)
 |-- long_green: double (nullable = true)
 |-- lat_green: double (nullable = true)
 |-- long_yellow: double (nullable = true)
 |-- lat_yellow: double (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- drop_loc: string (nullable = true)



In [16]:
streamingCountsDF = (
  df_with_lat_long.select("drop_loc", "tpep_dropoff_datetime")
    .groupBy(
        df_with_lat_long.drop_loc,
        window(df_with_lat_long.tpep_dropoff_datetime, "1 hour"))
    .count()
)

query = (
  streamingCountsDF
    .writeStream
    .format("memory")         # console or memory(= store in-memory table)
    .queryName("counts")      # counts = name of the in-memory table
    .outputMode("complete")
    # .option("truncate", "false")
    .start()
)

query.processAllAvailable()

In [17]:
query.stop()

In [18]:
try:
    spark.sql("select * from counts order by window").show(truncate=False)
    print ("Query executed")
except pyspark.sql.utils.AnalysisException:
    print("Unable to process your query!!")

+---------+------------------------------------------+-----+
|drop_loc |window                                    |count|
+---------+------------------------------------------+-----+
|citigroup|{2015-12-01 00:00:00, 2015-12-01 01:00:00}|5    |
|citigroup|{2015-12-01 01:00:00, 2015-12-01 02:00:00}|2    |
|citigroup|{2015-12-01 02:00:00, 2015-12-01 03:00:00}|1    |
|citigroup|{2015-12-01 04:00:00, 2015-12-01 05:00:00}|1    |
|citigroup|{2015-12-01 05:00:00, 2015-12-01 06:00:00}|8    |
|goldman  |{2015-12-01 05:00:00, 2015-12-01 06:00:00}|3    |
|goldman  |{2015-12-01 06:00:00, 2015-12-01 07:00:00}|11   |
|citigroup|{2015-12-01 06:00:00, 2015-12-01 07:00:00}|46   |
|goldman  |{2015-12-01 07:00:00, 2015-12-01 08:00:00}|17   |
|citigroup|{2015-12-01 07:00:00, 2015-12-01 08:00:00}|62   |
|goldman  |{2015-12-01 08:00:00, 2015-12-01 09:00:00}|25   |
|citigroup|{2015-12-01 08:00:00, 2015-12-01 09:00:00}|56   |
|citigroup|{2015-12-01 09:00:00, 2015-12-01 10:00:00}|60   |
|goldman  |{2015-12-01 0

In [20]:
from pyspark.sql.functions import hour

output_path_ex03 = 'file:///home/p4stwi2x/Desktop/abs/output_task_3'
hour_count = spark.sql('select * from counts order by window').withColumn("temp", (hour('window.start')+1)*360000)
for hour in hour_count.collect():
    timestamp_count = hour['temp']
    windows_data = hour['drop_loc']
    windows_count = hour['count']
    output_dir = os.path.join(output_path_ex03, f"output-{timestamp_count}")

    # Create a DataFrame for the current hour's data
    df_windows = spark.createDataFrame([(windows_data, windows_count,)], ["headquarter","count"])

    """if not os.path.exists(output_dir):
        # If directory does not exist, write the new DataFrame
        df_windows.write.mode("overwrite")\
                .format("csv")\
                .option("header", "true")\
                .save(output_dir)
        print(f"Timestamp {timestamp_count} has been exported to folder {output_dir}")
    else:"""
        # If directory exists, read and write to the existing CSV
    df_windows.write.mode("append")\
            .format("csv")\
            .option("header", "true")\
            .save(output_dir)
    print(f"Timestamp {timestamp_count} has been appended to folder {output_dir}")

Timestamp 360000 has been appended to folder file:///home/p4stwi2x/Desktop/abs/output_task_3/output-360000
Timestamp 720000 has been appended to folder file:///home/p4stwi2x/Desktop/abs/output_task_3/output-720000
Timestamp 1080000 has been appended to folder file:///home/p4stwi2x/Desktop/abs/output_task_3/output-1080000
Timestamp 1800000 has been appended to folder file:///home/p4stwi2x/Desktop/abs/output_task_3/output-1800000
Timestamp 2160000 has been appended to folder file:///home/p4stwi2x/Desktop/abs/output_task_3/output-2160000
Timestamp 2160000 has been appended to folder file:///home/p4stwi2x/Desktop/abs/output_task_3/output-2160000
Timestamp 2520000 has been appended to folder file:///home/p4stwi2x/Desktop/abs/output_task_3/output-2520000
Timestamp 2520000 has been appended to folder file:///home/p4stwi2x/Desktop/abs/output_task_3/output-2520000
Timestamp 2880000 has been appended to folder file:///home/p4stwi2x/Desktop/abs/output_task_3/output-2880000
Timestamp 2880000 has b

In [21]:
spark.stop()