# **Project 1 – NYC Taxi Analysis**


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col,
    unix_timestamp,
    udf,
    when,
    sum as spark_sum,
    count,
    avg,
    lag,
    lit
)
from pyspark.sql.window import Window
from shapely.geometry import shape, Point
from pyspark.sql.types import StringType
import json

# Create (or reuse) the Spark session used by the whole analysis.
spark = (
    SparkSession.builder
    .appName("NYC Taxi Analysis")
    .getOrCreate()
)

# Load the trip records: first row is the header, column types are inferred.
df = spark.read.csv(
    path="/content/Sample NYC Data.csv",
    header=True,
    inferSchema=True,
)

# df = df.limit(100)  # Uncomment to run on a small subset while testing

# Parse the borough boundary file (GeoJSON).
with open("/content/nyc-boroughs.geojson") as geojson_file:
    boroughs = json.load(geojson_file)

# One (borough name, Shapely geometry) pair per GeoJSON feature.
borough_shapes = [
    (
        feature["properties"]["borough"],
        shape(feature["geometry"]),
    )
    for feature in boroughs["features"]
]

# Define UDF to find borough for a given coordinate
def get_borough(lat, lon):
    """Return the name of the borough whose polygon contains a coordinate.

    Args:
        lat: Latitude of the point, or None when the source value is null.
        lon: Longitude of the point, or None when the source value is null.

    Returns:
        The name paired with the first geometry in ``borough_shapes`` that
        contains the point, or "Unknown" when no geometry contains it or
        when either coordinate is missing.
    """
    # Spark hands SQL NULLs to a Python UDF as None; building a Point from
    # None raises and would fail the whole task, so treat it as no match.
    if lat is None or lon is None:
        return "Unknown"

    # Shapely points are (x, y), i.e. (longitude, latitude).
    point = Point(lon, lat)
    for borough, polygon in borough_shapes:
        if polygon.contains(point):
            return borough
    return "Unknown"

# Expose the borough lookup to Spark as a string-returning UDF.
borough_udf = udf(get_borough, StringType())

# Tag every trip with the borough of its pickup point and of its dropoff point.
df = (
    df
    .withColumn(
        "pickup_borough",
        borough_udf(col("pickup_latitude"), col("pickup_longitude")),
    )
    .withColumn(
        "dropoff_borough",
        borough_udf(col("dropoff_latitude"), col("dropoff_longitude")),
    )
)

# Parse the pickup/dropoff datetime strings into epoch seconds.
# NOTE(review): the pattern assumes values like "31-12-13 23:59" — confirm against the CSV.
timestamp_pattern = "dd-MM-yy HH:mm"
df = (
    df
    .withColumn("pickup_ts", unix_timestamp("pickup_datetime", timestamp_pattern))
    .withColumn("dropoff_ts", unix_timestamp("dropoff_datetime", timestamp_pattern))
)

# Trip duration in seconds.
df = df.withColumn("trip_duration", col("dropoff_ts") - col("pickup_ts"))

# Keep only trips with a strictly positive duration of at most 4 hours (14400 s).
# Rows with a zero, negative or null duration are dropped by these predicates.
df = (
    df
    .filter(col("trip_duration") > 0)
    .filter(col("trip_duration") <= 14400)
)

# Compute idle time for each taxi: the gap in seconds between the previous
# trip's dropoff and this trip's pickup, per medallion, ordered by pickup time.
taxi_window = Window.partitionBy("medallion").orderBy("pickup_ts")
gap_since_prev_dropoff = col("pickup_ts") - col("prev_dropoff")
df = (
    df
    .withColumn("prev_dropoff", lag("dropoff_ts").over(taxi_window))
    .withColumn(
        "idle_time",
        # Only gaps in [0, 14400] s count as idle time; everything else is 0:
        #  * the first trip of a taxi has no previous dropoff (null gap),
        #  * gaps above 4 hours are not counted (presumably a shift break),
        #  * negative gaps (pickup before the previous dropoff, i.e. overlapping
        #    records) would otherwise subtract from the idle total and could
        #    push utilization above 1.
        when(
            (gap_since_prev_dropoff >= 0) & (gap_since_prev_dropoff <= 14400),
            gap_since_prev_dropoff,
        ).otherwise(lit(0)),
    )
)

# Utilization per taxi = time spent on trips / (time on trips + idle time).
total_trip_seconds = spark_sum("trip_duration")
total_idle_seconds = spark_sum("idle_time")
taxi_utilization = df.groupBy("medallion").agg(
    (total_trip_seconds / (total_trip_seconds + total_idle_seconds)).alias("utilization")
)

# Compute average time to find the next fare per borough.
# `idle_time` on a row is the wait that FOLLOWED the previous trip's dropoff,
# so it must be attributed to the borough where that previous trip ended,
# not to the current row's own dropoff borough. The output column keeps the
# name "dropoff_borough" so downstream joins are unaffected.
# NOTE(review): waits recorded as 0 (gaps outside the accepted range) are
# still included in the average — confirm that is intended.
taxi_next_fare = (
    df
    .withColumn("wait_start_borough", lag("dropoff_borough").over(taxi_window))
    # The first trip of each taxi has no preceding dropoff, hence no wait.
    .filter(col("wait_start_borough").isNotNull())
    .groupBy(col("wait_start_borough").alias("dropoff_borough"))
    .agg(avg("idle_time").alias("avg_time_to_next_fare"))
)

# Count trips that stay inside one borough (per borough) and trips that
# cross boroughs (per pickup/dropoff pair).
starts_and_ends_in_same_borough = col("pickup_borough") == col("dropoff_borough")

same_borough_trips = (
    df
    .filter(starts_and_ends_in_same_borough)
    .groupBy("pickup_borough")
    .agg(count("*").alias("same_borough_trips"))
)

diff_borough_trips = (
    df
    .filter(col("pickup_borough") != col("dropoff_borough"))
    .groupBy("pickup_borough", "dropoff_borough")
    .agg(count("*").alias("diff_borough_trips"))
)

# Show results – for testing purposes
# taxi_utilization.show()
# taxi_next_fare.show()
# same_borough_trips.show()
# diff_borough_trips.show()
# total_zero_idle_time = df.filter(col("idle_time") == 0).count()
# print(total_zero_idle_time)

# Rename the key of the next-fare table so it does not clash with the trip
# table's own "dropoff_borough" column in the expression join below.
taxi_next_fare_renamed = taxi_next_fare.withColumnRenamed("dropoff_borough", "dropoff_borough_nf")

# Attach every aggregate to the per-trip rows with left joins, then drop the
# helper join key.
trip_rows = df.select("medallion", "pickup_borough", "dropoff_borough", "trip_duration", "idle_time")
with_utilization = trip_rows.join(taxi_utilization, "medallion", "left")
with_next_fare = with_utilization.join(
    taxi_next_fare_renamed,
    df["dropoff_borough"] == taxi_next_fare_renamed["dropoff_borough_nf"],
    "left",
)
with_same_borough = with_next_fare.join(same_borough_trips, "pickup_borough", "left")
with_diff_borough = with_same_borough.join(
    diff_borough_trips,
    ["pickup_borough", "dropoff_borough"],
    "left",
)
df_final = with_diff_borough.drop("dropoff_borough_nf")

# Persist the enriched trips as Parquet, replacing any previous output.
results_writer = df_final.write.mode("overwrite")
results_writer.parquet("results_parquet")