### Install dependencies

In [1]:
!pip install shapely

Collecting shapely
  Using cached shapely-2.0.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Using cached shapely-2.0.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.5 MB)
Installing collected packages: shapely
Successfully installed shapely-2.0.7


In [2]:
from pyspark.sql import SparkSession

spark = (SparkSession.builder
                    .appName('project1')
                    .enableHiveSupport()
                    .getOrCreate()
        )

## Load the data

In [3]:
# Real taxi data
taxi_df = (
    spark.read
         .option("header", "true")
         .option("inferSchema", "true")
         .csv("input/trip_data/*.csv")
)

display(taxi_df)

boroughs_df = (spark.read.option("inferSchema", True).option("multiline", True).json("input/nyc-boroughs.geojson"))
display(boroughs_df)

DataFrame[medallion: string, hack_license: string, vendor_id: string, rate_code: string, store_and_fwd_flag: string, pickup_datetime: string, dropoff_datetime: string, passenger_count: string, trip_time_in_secs: string, trip_distance: string, pickup_longitude: string, pickup_latitude: string, dropoff_longitude: string, dropoff_latitude: string]

DataFrame[features: array<struct<geometry:struct<coordinates:array<array<array<double>>>,type:string>,id:bigint,properties:struct<@id:string,borough:string,boroughCode:bigint>,type:string>>, type: string]

## Enriching dataset with borough names

In [4]:
from shapely.geometry import Polygon
from pyspark.sql.functions import explode, col, udf

# udfs for shapely polygon creation and area calculation
def create_polygon(coordinates):
    return Polygon(coordinates[0]).wkt # wkt format can be deserialzed

def calculate_area(coordinates):
    return Polygon(coordinates[0]).area
    
create_polygon_udf = udf(create_polygon)
calculate_area_udf = udf(calculate_area)

In [5]:
from pyspark.sql.types import DoubleType

# explode the features array
boroughs_features_df = boroughs_df.select(explode(col("features")).alias("feature"))

boroughs_features_df.head()

# dataframe enriched with polygon and area
borough_geo_df = (
    boroughs_features_df.select(
        col("feature.properties.borough").alias("borough"),
        col("feature.properties.boroughCode").alias("boroughCode"),
        col("feature.geometry.coordinates").alias("coordinates")
    )
    .withColumn("polygon", create_polygon_udf(col("coordinates")))
    .withColumn("area", calculate_area_udf(col("coordinates")))
)

# cast area to double
borough_geo_df = borough_geo_df.withColumn("area", col("area").cast(DoubleType()))

In [6]:
from pyspark.sql.functions import desc, asc
from shapely.wkt import dumps

# sort the dataframe by boroughCode and area
boroughs_sorted = borough_geo_df.orderBy("boroughCode", desc("area"))
boroughs_sorted.select("borough", "area").show(20, truncate=False)

# broadcast the sorted boroughs dataframe
borough_bd = spark.sparkContext.broadcast(boroughs_sorted.collect())

+---------+---------------------+
|borough  |area                 |
+---------+---------------------+
|Manhattan|0.005859077996035753 |
|Manhattan|2.3271655856762013E-4|
|Manhattan|7.6037752599342E-5   |
|Manhattan|6.23157479510608E-5  |
|Manhattan|3.2658591272044954E-5|
|Manhattan|1.1828883137677094E-5|
|Manhattan|6.143638903459381E-6 |
|Manhattan|3.3831273674444417E-6|
|Manhattan|2.858823502476497E-6 |
|Manhattan|2.393654308790746E-6 |
|Manhattan|2.3345540772235924E-6|
|Manhattan|2.2627340976761147E-6|
|Manhattan|2.2465297475388565E-6|
|Manhattan|2.24116737819868E-6  |
|Manhattan|2.2094455927089308E-6|
|Manhattan|2.0486362960502745E-6|
|Manhattan|1.3300494774255711E-6|
|Manhattan|1.1111770551654054E-6|
|Manhattan|2.354033827718773E-7 |
|Manhattan|2.0929022926724522E-7|
+---------+---------------------+
only showing top 20 rows



In [7]:
from pyspark.sql.types import StringType
from shapely.geometry import Point
from shapely.wkt import loads

# udf for getting borough
def get_borough(lat, long):
    point = Point(long, lat)
    for row in borough_bd.value:
        if loads(row["polygon"]).contains(point):
            return row["borough"]
    return "unknown"

get_borough_udf = udf(get_borough)   

In [8]:
taxi_df = taxi_df.withColumn("pickup_borough", get_borough_udf(col("pickup_latitude"), col("pickup_longitude")))
taxi_df = taxi_df.withColumn("dropoff_borough", get_borough_udf(col("dropoff_latitude"), col("dropoff_longitude")))

# taxi_df.show()
taxi_df.select("pickup_borough", "dropoff_borough", "medallion", "pickup_datetime", "dropoff_datetime").show(20)

+--------------+---------------+--------------------+-------------------+-------------------+
|pickup_borough|dropoff_borough|           medallion|    pickup_datetime|   dropoff_datetime|
+--------------+---------------+--------------------+-------------------+-------------------+
|     Manhattan|      Manhattan|89D227B655E5C82AE...|2013-01-01 15:11:48|2013-01-01 15:18:10|
|     Manhattan|      Manhattan|0BD7C8F5BA12B88E0...|2013-01-06 00:18:35|2013-01-06 00:22:54|
|     Manhattan|      Manhattan|0BD7C8F5BA12B88E0...|2013-01-05 18:49:41|2013-01-05 18:54:23|
|     Manhattan|      Manhattan|DFD2202EE08F7A8DC...|2013-01-07 23:54:15|2013-01-07 23:58:20|
|     Manhattan|      Manhattan|DFD2202EE08F7A8DC...|2013-01-07 23:25:03|2013-01-07 23:34:24|
|     Manhattan|      Manhattan|20D9ECB2CA0767CF7...|2013-01-07 15:27:48|2013-01-07 15:38:37|
|     Manhattan|      Manhattan|496644932DF393260...|2013-01-08 11:01:15|2013-01-08 11:08:14|
|     Manhattan|         Queens|0B57B9633A2FECD3D...|2013-01

## Data cleansing

In [9]:
from pyspark.sql.functions import unix_timestamp
# add columns for pickup and dropoff times in unix timestamps 

time_format = "yyyy-MM-dd HH:mm:ss"

taxi_df = (taxi_df
           .withColumn("pickup_timestamp", unix_timestamp(col("pickup_datetime"), time_format))
           .withColumn("dropoff_timestamp", unix_timestamp(col("dropoff_datetime"), time_format))
           .withColumn("duration", col("dropoff_timestamp") - col("pickup_timestamp"))
          )

taxi_df.select("pickup_timestamp", "dropoff_timestamp", "pickup_datetime", "dropoff_datetime", "duration", "pickup_borough", "dropoff_borough", "medallion").show(20)

+----------------+-----------------+-------------------+-------------------+--------+--------------+---------------+--------------------+
|pickup_timestamp|dropoff_timestamp|    pickup_datetime|   dropoff_datetime|duration|pickup_borough|dropoff_borough|           medallion|
+----------------+-----------------+-------------------+-------------------+--------+--------------+---------------+--------------------+
|      1357053108|       1357053490|2013-01-01 15:11:48|2013-01-01 15:18:10|     382|     Manhattan|      Manhattan|89D227B655E5C82AE...|
|      1357431515|       1357431774|2013-01-06 00:18:35|2013-01-06 00:22:54|     259|     Manhattan|      Manhattan|0BD7C8F5BA12B88E0...|
|      1357411781|       1357412063|2013-01-05 18:49:41|2013-01-05 18:54:23|     282|     Manhattan|      Manhattan|0BD7C8F5BA12B88E0...|
|      1357602855|       1357603100|2013-01-07 23:54:15|2013-01-07 23:58:20|     245|     Manhattan|      Manhattan|DFD2202EE08F7A8DC...|
|      1357601103|       135760166

In [None]:
# filter out the trips with invalid duration
print(taxi_df.count())

max_trip_duration = 4 * 60 * 60 # 4 hours

filtered_taxi_df = taxi_df.filter((col("duration") > 0) & (col("duration") <= max_trip_duration))
print(filtered_taxi_df.count())


## Utilization Query

In [12]:
from pyspark.sql.window import Window
from pyspark.sql.functions import lag, sum as _sum, when

# partition data by medallion
window_spec = Window.partitionBy("medallion").orderBy("pickup_timestamp")

# add previous dropoff time as column
taxi_df = taxi_df.withColumn("prev_dropoff_timestamp", lag("dropoff_timestamp").over(window_spec))

# add idle time column
taxi_df = taxi_df.withColumn("idle_time", col("pickup_timestamp") - col("prev_dropoff_timestamp"))

session_threshold = 4 * 60 * 60
# if idle time is more than 4 hours, count consider it's a new session
taxi_df = taxi_df.withColumn(
    "idle_time",
    when(col("idle_time") <= session_threshold, col("idle_time")).otherwise(0)
)

# group by taxi ID and get the total idle time
utilization_df = (
    taxi_df
    .groupBy("medallion")
    .agg(_sum("duration").alias("busy_time_sum"),
         _sum("idle_time").alias("idle_time_sum")
    )
)

utilization_df = utilization_df.withColumn(
    "utilization_rate",
    col("busy_time_sum") / (col("busy_time_sum") + col("idle_time_sum"))
)

utilization_df.show(50)

+--------------------+-------------+-------------+-------------------+
|           medallion|busy_time_sum|idle_time_sum|   utilization_rate|
+--------------------+-------------+-------------+-------------------+
|000318C2E3E638158...|        13920|        17400| 0.4444444444444444|
|002E3B405B6ABEA23...|        10260|        16140| 0.3886363636363636|
|0030AD2648D81EE87...|         1980|          720| 0.7333333333333333|
|0036961468659D0BF...|        11700|        19740|0.37213740458015265|
|0038EF45118925A51...|        10920|        15120|0.41935483870967744|
|0053334C798EC6C8E...|         7920|        22440| 0.2608695652173913|
|005DED7D6E6C45441...|        11460|        11760| 0.4935400516795866|
|005F00B38F46E2100...|        18600|        42180| 0.3060217176702863|
|00790C7BAD30B7A9E...|        12360|        25320|0.32802547770700635|
|0094A03FFE6BAFBE0...|        10680|         5400|  0.664179104477612|
|009D3CCA83486B03F...|        19620|        40920| 0.3240832507433102|
|009E6

## Average time for taxi to find next fare per destination borough

In [14]:
from pyspark.sql.window import Window
from pyspark.sql.functions import lead, avg

# window specification for each taxi, ordered by dropoff time
window_spec = Window.partitionBy("medallion").orderBy("dropoff_timestamp")

# get the pickup time of the next trip for the same taxi
taxi_df = taxi_df.withColumn("next_pickup_timestamp", lead("pickup_timestamp").over(window_spec))

# compute idle time between trips
taxi_df = taxi_df.withColumn("time_to_next_fare", col("next_pickup_timestamp") - col("dropoff_timestamp"))

# compute the average idle time per drop-off borough
avg_time_per_borough = (
    taxi_df.groupBy("dropoff_borough")
    .agg(avg("time_to_next_fare").alias("avg_time_to_next_fare"))
)

# results, we left records with unknown pick up/drop off borough (means ride was from outside of NYC or from NYC to outside city)
avg_time_per_borough.show(truncate=False)


+---------------+---------------------+
|dropoff_borough|avg_time_to_next_fare|
+---------------+---------------------+
|Queens         |6368.423432682425    |
|unknown        |12206.935332708528   |
|Brooklyn       |6554.840325610519    |
|Staten Island  |13935.0              |
|Manhattan      |2048.9211563256895   |
|Bronx          |4973.719008264463    |
+---------------+---------------------+

