# Command hỗ trợ

In [None]:
# Helper cell: stop the SparkSession bound to `spark`.
# `spark` must already exist when this cell runs.
spark.stop()

# Task 4

### Khai báo các biến cần thiết

In [1]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType, IntegerType

# Input directory handed to the streaming CSV reader.
path = "./taxi-data/"

# Schema of a yellow-taxi record. Order is significant: the names are applied
# positionally to the raw CSV columns. Every field is nullable.
yellow_schema = StructType([
    StructField(name, dtype, True)
    for name, dtype in (
        ("type", StringType()),
        ("VendorID", IntegerType()),
        ("tpep_pickup_datetime", TimestampType()),
        ("tpep_dropoff_datetime", TimestampType()),
        ("passenger_count", DoubleType()),
        ("trip_distance", DoubleType()),
        ("pickup_longitude", DoubleType()),
        ("pickup_latitude", DoubleType()),
        ("RatecodeID", DoubleType()),
        ("store_and_fwd_flag", StringType()),
        ("dropoff_longitude", DoubleType()),
        ("dropoff_latitude", DoubleType()),
        ("payment_type", DoubleType()),
        ("fare_amount", DoubleType()),
        ("extra", DoubleType()),
        ("mta_tax", DoubleType()),
        ("tip_amount", DoubleType()),
        ("tolls_amount", DoubleType()),
        ("improvement_surcharge", DoubleType()),
        ("total_amount", DoubleType()),
    )
])

# Schema of a green-taxi record. Order is significant: the names are applied
# positionally to the raw CSV columns. Every field is nullable.
green_schema = StructType([
    StructField(name, dtype, True)
    for name, dtype in (
        ("type", StringType()),
        ("VendorID", IntegerType()),
        ("lpep_pickup_datetime", TimestampType()),
        ("Lpep_dropoff_datetime", TimestampType()),
        ("Store_and_fwd_flag", StringType()),
        ("RateCodeID", DoubleType()),
        ("Pickup_longitude", DoubleType()),
        ("Pickup_latitude", DoubleType()),
        ("dropoff_longitude", DoubleType()),
        ("dropoff_latitude", DoubleType()),
        ("Passenger_count", DoubleType()),
        ("Trip_distance", DoubleType()),
        ("Fare_amount", DoubleType()),
        ("Extra", DoubleType()),
        ("MTA_tax", DoubleType()),
        ("Tip_amount", DoubleType()),
        ("Tolls_amount", DoubleType()),
        ("Ehail_fee", DoubleType()),
        ("improvement_surcharge", DoubleType()),
        ("Total_amount", DoubleType()),
        ("Payment_type", DoubleType()),
        ("Trip_type", DoubleType()),
    )
])


# Untyped read schema: 22 nullable string columns named _c0 .. _c21.
default_schema = StructType(
    [StructField(f"_c{index}", StringType(), True) for index in range(22)]
)


# Corner coordinates of the two drop-off areas.
# Each pair is ordered (longitude, latitude), i.e. (x, y).
goldman = [
    (-74.0141012, 40.7152191),
    (-74.013777, 40.7152275),
    (-74.0141027, 40.7138745),
    (-74.0144185, 40.7140753),
]
citigroup = [
    (-74.011869, 40.7217236),
    (-74.009867, 40.721493),
    (-74.010140, 40.720053),
    (-74.012083, 40.720267),
]

In [2]:
from pyspark.sql.functions import udf
from pyspark.sql.types import BooleanType
from pyspark.sql.functions import udf
from shapely.geometry import Point, Polygon

from shapely.geometry import Point, Polygon

# Build one shapely Polygon per building. The corner pairs are first wrapped
# as Point objects (kept as module-level lists), in their listed order.
goldman_points = [Point(lon, lat) for lon, lat in goldman]
goldman_polygon = Polygon(goldman_points)

citigroup_points = [Point(lon, lat) for lon, lat in citigroup]
citigroup_polygon = Polygon(citigroup_points)

# Point-in-polygon test; wrapped by the Spark UDFs for both buildings.
def inside_polygon(longitude, latitude, polygon):
    """Return whether the point (longitude, latitude) lies within ``polygon``.

    Args:
        longitude: x coordinate of the point, or None.
        latitude: y coordinate of the point, or None.
        polygon: shapely geometry to test the point against.

    Returns:
        False if either coordinate is None, otherwise the result of
        ``Point(longitude, latitude).within(polygon)``.
    """
    # The coordinate columns are produced by casting strings to double, so a
    # malformed value arrives here as None. Building a Point from None raises,
    # which would fail the whole streaming query; treat it as "not inside".
    if longitude is None or latitude is None:
        return False
    return Point(longitude, latitude).within(polygon)

# Label a drop-off point with the building whose polygon contains it.
def get_dropoff_location(longitude, latitude):
    """Return the name of the building containing the drop-off point.

    Args:
        longitude: x coordinate of the drop-off point, or None.
        latitude: y coordinate of the drop-off point, or None.

    Returns:
        "Goldman Sachs" or "Citigroup" when the point lies within the
        corresponding polygon (Goldman Sachs is checked first); None when it
        lies in neither or when either coordinate is None.
    """
    # A missing coordinate cannot form a Point (the constructor raises), so it
    # is reported as "no location" instead of failing the caller.
    if longitude is None or latitude is None:
        return None

    point = Point(longitude, latitude)
    if point.within(goldman_polygon):
        return "Goldman Sachs"
    if point.within(citigroup_polygon):
        return "Citigroup"
    return None


# Boolean UDFs, one per building. Each takes a (longitude, latitude) pair of
# columns and delegates to inside_polygon with that building's polygon.
inside_goldman = udf(lambda lon, lat: inside_polygon(lon, lat, goldman_polygon), BooleanType())
inside_citigroup = udf(lambda lon, lat: inside_polygon(lon, lat, citigroup_polygon), BooleanType())

# String UDF wrapping get_dropoff_location.
get_dropoff_location_udf = udf(get_dropoff_location, StringType())


### Xử lý

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.functions import col, lit
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Obtain the SparkSession for this job (getOrCreate reuses an active one).
spark = (
    SparkSession.builder
    .appName("TrendingArrivals")
    .getOrCreate()
)

# Stream CSV files from `path`. default_schema declares every column as a
# string; typing is applied later, per taxi type.
input_stream = spark.readStream.schema(default_schema).csv(path)

# Yellow taxi trips: keep rows tagged "yellow", drop _c20/_c21 so the column
# count matches yellow_schema, then rename the columns positionally.
yellow_trips = (
    input_stream
    .filter(col("_c0") == "yellow")
    .drop("_c20", "_c21")
    .toDF(*yellow_schema.names)
)

# Cast every column to its yellow_schema type in one projection. A single
# select keeps the query plan flat, whereas a withColumn call per column adds
# one projection per call.
yellow_trips = yellow_trips.select(
    [
        col(field.name).cast(field.dataType).alias(field.name)
        for field in yellow_schema.fields
    ]
)

# yellow_trips.printSchema()

# Green taxi trips: keep rows tagged "green" and rename all columns
# positionally to the green_schema names.
green_trips = (
    input_stream
    .filter(col("_c0") == "green")
    .toDF(*green_schema.names)
)

# Cast every column to its green_schema type in one projection. A single
# select keeps the query plan flat, whereas a withColumn call per column adds
# one projection per call.
green_trips = green_trips.select(
    [
        col(field.name).cast(field.dataType).alias(field.name)
        for field in green_schema.fields
    ]
)

# green_trips.printSchema()

def _arrivals_at(trips, inside_area, dropoff_col, location):
    """Select the drop-off times of `trips` that end inside one area.

    Rows are kept when `inside_area` (a boolean UDF) accepts the drop-off
    longitude/latitude. The result has two columns: the drop-off timestamp
    renamed to "dropoff_datetime", and a constant "location" label.
    """
    in_area = inside_area(col("dropoff_longitude"), col("dropoff_latitude"))
    return (
        trips
        .filter(in_area)
        .select(col(dropoff_col).alias("dropoff_datetime"))
        .withColumn("location", lit(location))
    )


# Yellow trips carry their drop-off time in "tpep_dropoff_datetime".
goldman_sachs_yellow_arrivals = _arrivals_at(
    yellow_trips, inside_goldman, "tpep_dropoff_datetime", "goldman"
)
citigroup_yellow_arrivals = _arrivals_at(
    yellow_trips, inside_citigroup, "tpep_dropoff_datetime", "citigroup"
)

# Green trips carry their drop-off time in "Lpep_dropoff_datetime".
goldman_sachs_green_arrivals = _arrivals_at(
    green_trips, inside_goldman, "Lpep_dropoff_datetime", "goldman"
)
citigroup_green_arrivals = _arrivals_at(
    green_trips, inside_citigroup, "Lpep_dropoff_datetime", "citigroup"
)

# Merge yellow and green per building, then both buildings into one stream.
goldman_sachs_arrivals = goldman_sachs_yellow_arrivals.union(goldman_sachs_green_arrivals)
citigroup_arrivals = citigroup_yellow_arrivals.union(citigroup_green_arrivals)
arrivals = goldman_sachs_arrivals.union(citigroup_arrivals)

# Seconds between 2015-12-01 00:00:00 and the end of a window.
_seconds_since_start = (
    unix_timestamp(col("window.end")) - unix_timestamp(lit("2015-12-01 00:00:00"))
)

# Count arrivals per location in 10-minute windows of the drop-off time.
# "timestamp" is the window end expressed as the seconds above multiplied by
# 100, so two adjacent windows are 60000 apart. Rows are sorted by window,
# then location.
ten_min_window = (
    arrivals
    .withWatermark("dropoff_datetime", "10 minutes")
    .groupBy(window("dropoff_datetime", "10 minutes"), "location")
    .count()
    .withColumnRenamed("count", "current_count")
    .withColumn("timestamp", _seconds_since_start * 100)
    .orderBy("window", "location")
)

from IPython.display import display
import os

def _report_trends(rows, key, label, out_dir):
    """Announce and record every trending window found in ``rows``.

    A row is reported when its count is at least 10 and either
      * the preceding row is the directly preceding window and the count is
        at least twice that row's count, or
      * the preceding row is not the directly preceding window, in which case
        the previous count is reported as 0.

    Args:
        rows: list of row tuples read as (window, location, current_count,
            timestamp); indexes 2 and 3 are used. Assumed to be sorted by
            window -- TODO confirm ordering survives filter/collect.
        key: location key written to the output record (e.g. "goldman").
        label: display name used in the message (e.g. "Goldman Sachs").
        out_dir: directory receiving the ``part-<timestamp>.txt`` files.
    """
    # Distance between the timestamps of two adjacent 10-minute windows.
    window_step = 60000

    # NOTE(review): the first row never has a predecessor and is therefore
    # never reported, even with >= 10 arrivals -- kept as in the original.
    for previous, current in zip(rows, rows[1:]):
        count, stamp = current[2], current[3]
        if count < 10:
            continue

        if stamp - window_step == previous[3]:
            prior = previous[2]
            if count < prior * 2:
                continue
        else:
            # Gap in the windows: nothing arrived in the preceding interval.
            prior = 0

        message = (
            f"The number of arrivals to {label} has doubled "
            f"from {prior} to {count} at {stamp}!"
        )
        print(message)
        display(message)
        # Files are opened in append mode, so a window reported again by a
        # later batch adds another line to the same file.
        with open(os.path.join(out_dir, f"part-{stamp}.txt"), "a", encoding="utf-8") as file:
            file.write(f"({key},({count},{stamp},{prior}))\n")


def foreach_batch_function(df, epoch_id):
    """Report trending arrivals for both buildings from one micro-batch.

    Args:
        df: batch DataFrame with a "location" column; its rows are collected
            to the driver as tuples.
        epoch_id: batch identifier supplied by Spark (unused).

    Side effects:
        Creates ``output/`` if needed, prints and displays one message per
        detected trend, and appends a record to ``output/part-<timestamp>.txt``.
    """
    rows_by_key = {
        key: df.filter(col("location") == key).rdd.map(tuple).collect()
        for key in ("goldman", "citigroup")
    }

    out_dir = "output/"
    os.makedirs(out_dir, exist_ok=True)

    _report_trends(rows_by_key["goldman"], "goldman", "Goldman Sachs", out_dir)
    _report_trends(rows_by_key["citigroup"], "citigroup", "Citigroup", out_dir)

# Calculate the previous count and detect trends
# previous_count = ten_min_window \
#     .filter("current_count >= 10 ") \
#     .select("location", "current_count", "timestamp", "previous_count")


# previous_count = ten_min_window \
#     .join(withColumn("previous_count", lag("current_count", 1).over(window("location", "unboundedPreceding", "unboundedFollowing"))), on="location", how="left") \
    # .drop("window") \
    # .withColumnRenamed("lag(current_count, 1, null)", "previous_count") \
    # .withColumnRenamed("window", "timestamp") \
    # .filter((col("current_count") >= 10) & (col("current_count") >= 2 * col("previous_count"))) \
    # .select("location", "current_count", "window.start", "previous_count")

# Output the trend detections to console
# console_output = previous_count \
#     .select(when(col("location") == "goldman", concat(lit("The number of arrivals to Goldman Sachs has doubled from "), col("previous_count"), lit(" to "), col("current_count"), lit(" at "), col("timestamp")))
#              .otherwise(concat(lit("The number of arrivals to Citigroup has doubled from "), col("previous_count"), lit(" to "), col("current_count"), lit(" at "), col("timestamp")))
#              .alias("message"))

# # Write the results to the output directory
# # file_output = previous_count \
# #     .select("location", struct("current_count", "timestamp", "previous_count").alias("values"))

# console_query = console_output.writeStream \
#     .outputMode("complete") \
#     .format("console") \
#     .start()

# file_query = file_output.writeStream \
#     .outputMode("append") \
#     .format("json") \
#     .option("checkpointLocation", "checkpoint") \
#     .option("path", "output") \
#     .start()

# console_query.awaitTermination()
# file_query.awaitTermination()

# Run foreach_batch_function on every micro-batch of the windowed counts.
# Complete mode hands the callback the whole result table each time.
# No format() is set: foreachBatch installs its own sink, so a preceding
# format("console") has no effect.
stream_query = (
    ten_min_window.writeStream
    .outputMode("Complete")
    .foreachBatch(foreach_batch_function)
    .start()
)

# Block this cell until the query stops or fails.
stream_query.awaitTermination()


24/05/22 19:41:04 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-06727a3d-762c-4ab0-b2da-c6ac104c283a. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/05/22 19:41:04 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
                                                                                

'The number of arrivals to Citigroup has doubled from 3 to 12 at 3240000!'

'The number of arrivals to Citigroup has doubled from 3 to 10 at 5100000!'