# Environment prep

In [1]:
!pip install pyspark



In [2]:
!pip install kafka-python



In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, window, udf, count, rank, lit, when, struct, collect_list, median, current_timestamp
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, TimestampType
import math
import time

# Initialize Spark Session
spark = SparkSession.builder \
    .appName("BDM_project_2") \
    .getOrCreate()

print("Spark version:", spark.version)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/30 23:08:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark version: 3.5.4


In [4]:
# Define schema for taxi data
schema = StructType([
    StructField("medallion", StringType(), True),
    StructField("hack_license", StringType(), True),
    StructField("pickup_datetime", TimestampType(), True),
    StructField("dropoff_datetime", TimestampType(), True),
    StructField("trip_time_in_secs", IntegerType(), True),
    StructField("trip_distance", DoubleType(), True),
    StructField("pickup_longitude", DoubleType(), True),
    StructField("pickup_latitude", DoubleType(), True),
    StructField("dropoff_longitude", DoubleType(), True),
    StructField("dropoff_latitude", DoubleType(), True),
    StructField("payment_type", StringType(), True),
    StructField("fare_amount", DoubleType(), True),
    StructField("surcharge", DoubleType(), True),
    StructField("mta_tax", DoubleType(), True),
    StructField("tip_amount", DoubleType(), True),
    StructField("tolls_amount", DoubleType(), True)
])

# Data cleaning (Query 0)

In [5]:
data_path = "./sorted_data_1gb"
df = spark.read.csv(data_path, schema = schema, header = True)

cleaned_df = df.filter(
    (col("pickup_longitude").isNotNull()) &
    (col("pickup_latitude").isNotNull()) &
    (col("dropoff_longitude").isNotNull()) &
    (col("dropoff_latitude").isNotNull()) &
    (col("pickup_longitude") != 0) &
    (col("pickup_latitude") != 0) &
    (col("dropoff_longitude") != 0) &
    (col("dropoff_latitude") != 0) &
    (col("medallion").isNotNull()) &
    (col("hack_license").isNotNull()) &
    (col("trip_time_in_secs") > 0) &
    (col("trip_distance") > 0) &
    (col("fare_amount") > 0)
)

print("Query 0 - Sample of Cleaned Data:")
cleaned_df.show(5)

Query 0 - Sample of Cleaned Data:
+--------------------+--------------------+-------------------+-------------------+-----------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+
|           medallion|        hack_license|    pickup_datetime|   dropoff_datetime|trip_time_in_secs|trip_distance|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|payment_type|fare_amount|surcharge|mta_tax|tip_amount|tolls_amount|
+--------------------+--------------------+-------------------+-------------------+-----------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+
|FF0622D5D4D01EB3C...|E1E2DD8B97AB23CC6...|2013-11-15 21:39:56|2013-11-15 21:45:19|              322|          1.1|        -73.9776|      40.786705|       -73.981133|       40.774364|         CRD|        6.0|     

In [6]:
def get_grid_cell(lat, lon, cell_size_m):
    if lat is None or lon is None:
        return None
    origin_lat, origin_lon = 41.474937, -74.913585
    lat_per_cell = cell_size_m / 111000.0
    lon_per_cell = cell_size_m / (111000.0 * math.cos(math.radians(origin_lat)))
    lat_offset = math.floor((lat - origin_lat) / lat_per_cell) + 1
    lon_offset = math.floor((lon - origin_lon) / lon_per_cell) + 1
    return f"{lat_offset}.{lon_offset}"

grid_cell_500_udf = udf(lambda lat, lon: get_grid_cell(lat, lon, 500), StringType())
grid_cell_250_udf = udf(lambda lat, lon: get_grid_cell(lat, lon, 250), StringType())

# Grid transformation
df_grid = cleaned_df \
    .withColumn("pickup_grid_500", grid_cell_500_udf(col("pickup_latitude"), col("pickup_longitude"))) \
    .withColumn("dropoff_grid_500", grid_cell_500_udf(col("dropoff_latitude"), col("dropoff_longitude"))) \
    .withColumn("pickup_grid_250", grid_cell_250_udf(col("pickup_latitude"), col("pickup_longitude"))) \
    .withColumn("dropoff_grid_250", grid_cell_250_udf(col("dropoff_latitude"), col("dropoff_longitude"))) \
    .filter(col("pickup_grid_500").isNotNull() & col("dropoff_grid_500").isNotNull() &
            col("pickup_grid_250").isNotNull() & col("dropoff_grid_250").isNotNull())

print("Grid Cell Sample:")
df_grid.select("pickup_grid_500", "dropoff_grid_500", "pickup_grid_250", "dropoff_grid_250").show(5)

Grid Cell Sample:
+---------------+----------------+---------------+----------------+
|pickup_grid_500|dropoff_grid_500|pickup_grid_250|dropoff_grid_250|
+---------------+----------------+---------------+----------------+
|       -152.156|        -155.156|       -305.312|        -311.311|
|       -155.160|        -162.154|       -310.320|        -324.308|
|       -156.159|        -168.159|       -313.318|        -337.318|
|       -169.162|        -168.158|       -338.323|        -337.316|
|       -154.162|        -155.160|       -308.323|        -310.319|
+---------------+----------------+---------------+----------------+
only showing top 5 rows



# Query 1

Part 1

In [7]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Define a fixed reference timestamp
reference_time = df_grid.agg(F.max("dropoff_datetime")).collect()[0][0]

# Filter the data for trips that occurred in the last 30 minutes relative to the reference time
time_limit = F.lit(reference_time) - F.expr("INTERVAL 30 MINUTES")

# Filter the DataFrame to include only trips completed in the last 30 minutes from the reference time
df_recent = df_grid.filter(F.col("dropoff_datetime") >= time_limit)

# Perform the groupBy aggregation to find frequent routes within the last 30 minutes
route_counts = df_recent \
    .groupBy("pickup_grid_500", "dropoff_grid_500") \
    .agg(F.count("*").alias("ride_count"))

# Rank routes by their frequency
window_spec = Window.orderBy(F.col("ride_count").desc())

# Apply the window function to rank the routes
top_10_routes_static = route_counts \
    .withColumn("rank", F.rank().over(window_spec)) \
    .filter(F.col("rank") <= 10)

# Select only the relevant columns for output: start cell, end cell, and number of rides
top_10_routes_static = top_10_routes_static.select(
    F.col("pickup_grid_500").alias("start_cell"),
    F.col("dropoff_grid_500").alias("end_cell"),
    "ride_count"
)

# Show the top 10 frequent routes
top_10_routes_static.show(10)

25/03/30 23:09:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/30 23:09:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/30 23:09:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.

+----------+--------+----------+
|start_cell|end_cell|ride_count|
+----------+--------+----------+
|  -167.154|-165.155|         1|
|  -168.155|-157.154|         1|
|  -160.176|-161.174|         1|
|  -158.158|-160.152|         1|
|  -160.154|-158.154|         1|
|  -154.161|-151.163|         1|
|  -167.153|-179.155|         1|
|  -162.152|-117.178|         1|
|  -154.161|-155.159|         1|
|  -154.156|-160.157|         1|
+----------+--------+----------+
only showing top 10 rows



25/03/30 23:09:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/30 23:09:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/30 23:09:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/30 23:09:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

Part 2

In [8]:
df_grid = df_grid.withColumn("event_time", col("dropoff_datetime"))

# Apply watermarking to track events dynamically
windowed_routes = df_grid \
    .withWatermark("event_time", "30 minutes") \
    .groupBy(window("event_time", "30 minutes"), "pickup_grid_500", "dropoff_grid_500") \
    .agg(count("*").alias("ride_count"))

# Rank routes dynamically within each 30-minute window
window_spec = Window.partitionBy("window").orderBy(col("ride_count").desc())

top_routes = windowed_routes \
    .withColumn("rank", rank().over(window_spec)) \
    .filter(col("rank") <= 10)

# Processing delay
top_routes = top_routes.withColumn("delay", (current_timestamp() - col("window.start")).cast("long"))

final_result = top_routes.select(
    col("window.start").alias("pickup_datetime"),
    col("window.end").alias("dropoff_datetime"),
    col("pickup_grid_500").alias("start_cell"),
    col("dropoff_grid_500").alias("end_cell"),
    col("delay")
)

final_result.show(10)



+-------------------+-------------------+----------+--------+---------+
|    pickup_datetime|   dropoff_datetime|start_cell|end_cell|    delay|
+-------------------+-------------------+----------+--------+---------+
|2013-01-01 04:00:00|2013-01-01 04:30:00|  -162.154|-167.153|386363342|
|2013-01-01 04:00:00|2013-01-01 04:30:00|  -163.155|-169.151|386363342|
|2013-01-01 04:00:00|2013-01-01 04:30:00|  -158.154|-155.160|386363342|
|2013-01-01 04:00:00|2013-01-01 04:30:00|  -150.161|-149.161|386363342|
|2013-01-01 04:00:00|2013-01-01 04:30:00|  -152.162|-152.160|386363342|
|2013-01-01 04:00:00|2013-01-01 04:30:00|  -153.156|-159.154|386363342|
|2013-01-01 04:00:00|2013-01-01 04:30:00|  -155.161|-154.162|386363342|
|2013-01-01 04:00:00|2013-01-01 04:30:00|  -156.153|-167.153|386363342|
|2013-01-01 04:00:00|2013-01-01 04:30:00|  -156.155|-148.161|386363342|
|2013-01-01 04:00:00|2013-01-01 04:30:00|  -156.159|-157.158|386363342|
+-------------------+-------------------+----------+--------+---

                                                                                

# Query 2

Part 1

In [9]:
from pyspark.sql import Window
from pyspark.sql.functions import col, median, count, when, rank

# Add profit column and 250m grid cells
profit_df = cleaned_df \
    .withColumn("profit", col("fare_amount") + col("tip_amount")) \
    .withColumn("pickup_grid_250", grid_cell_250_udf(col("pickup_latitude"), col("pickup_longitude"))) \
    .withColumn("dropoff_grid_250", grid_cell_250_udf(col("dropoff_latitude"), col("dropoff_longitude"))) \
    .filter(col("pickup_grid_250").isNotNull() & col("dropoff_grid_250").isNotNull())

# Step 1: Calculate median profit per pickup cell
median_profit = profit_df \
    .groupBy("pickup_grid_250") \
    .agg(median("profit").alias("median_profit"))

# Step 2: Estimate empty taxis (simplified for static data)
# For each medallion, count dropoffs; assume a taxi is "empty" if it has no subsequent pickup within 30 minutes
# In static data, we'll approximate by counting unique dropoffs per cell without time tracking
empty_taxis = profit_df \
    .groupBy("medallion", "dropoff_grid_250", "dropoff_datetime") \
    .agg(count("*").alias("trip_count")) \
    .groupBy("dropoff_grid_250") \
    .agg(count("medallion").alias("empty_taxis"))  # Simplified: counts unique medallions per dropoff cell

# Step 3: Compute profitability
profitability = median_profit \
    .join(empty_taxis, median_profit.pickup_grid_250 == empty_taxis.dropoff_grid_250, "left_outer") \
    .na.fill({"empty_taxis": 0}) \
    .withColumn("profitability",
                when(col("empty_taxis") > 0, col("median_profit") / col("empty_taxis"))
                .otherwise(col("median_profit"))) \
    .select(col("pickup_grid_250").alias("cell_id"), "empty_taxis", "median_profit", "profitability")

# Step 4: Rank and get top 10
window_spec_profit = Window.orderBy(col("profitability").desc())
top_10_profit_static = profitability \
    .withColumn("rank", rank().over(window_spec_profit)) \
    .filter(col("rank") <= 10) \
    .select("cell_id", "empty_taxis", "median_profit", "profitability", "rank")

# Show results
print("Query 2 Part 1 - Top 10 Profitable Areas (Static):")
top_10_profit_static.show(10)

25/03/30 23:09:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/30 23:09:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/30 23:09:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


Query 2 Part 1 - Top 10 Profitable Areas (Static):


25/03/30 23:09:20 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/30 23:09:20 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/30 23:09:20 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/30 23:09:20 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/30 23:09:20 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/30 23:09:20 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/30 23:09:20 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/03/30 23:09:21 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/30 23:09:2

+--------+-----------+-------------+-------------+----+
| cell_id|empty_taxis|median_profit|profitability|rank|
+--------+-----------+-------------+-------------+----+
|-128.590|          0|       504.77|       504.77|   1|
| -504.95|          0|        360.0|        360.0|   2|
|-386.228|          0|        300.0|        300.0|   3|
|-256.372|          0|        296.4|        296.4|   4|
|-173.480|          1|        288.0|        288.0|   5|
|-365.113|          1|        288.0|        288.0|   5|
| -387.75|          0|        273.0|        273.0|   7|
|-157.512|          0|        270.0|        270.0|   8|
|-332.283|          1|        260.0|        260.0|   9|
|-146.495|          0|        250.0|        250.0|  10|
+--------+-----------+-------------+-------------+----+



25/03/30 23:09:27 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/30 23:09:27 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

Testing Q2P1 with time windows

In [10]:
from pyspark.sql import Window
from pyspark.sql.functions import col, median, count, when, rank, max
from pyspark.sql import functions as F

# Add profit column and 250m grid cells
profit_df = cleaned_df \
    .withColumn("profit", col("fare_amount") + col("tip_amount")) \
    .withColumn("pickup_grid_250", grid_cell_250_udf(col("pickup_latitude"), col("pickup_longitude"))) \
    .withColumn("dropoff_grid_250", grid_cell_250_udf(col("dropoff_latitude"), col("dropoff_longitude"))) \
    .filter(col("pickup_grid_250").isNotNull() & col("dropoff_grid_250").isNotNull())

# Step 1: Get the latest timestamp from the data
latest_time = profit_df.agg(max("dropoff_datetime")).collect()[0][0]

# Step 2: Filter trips that started in the last 15 minutes relative to the latest timestamp
profit_df_filtered = profit_df.filter(col("pickup_datetime") >= (F.lit(latest_time) - F.expr("INTERVAL 15 MINUTES")))

# Step 3: Calculate the median profit per pickup grid cell (within the last 15 minutes)
median_profit = profit_df_filtered \
    .groupBy("pickup_grid_250") \
    .agg(F.expr("percentile_approx(profit, 0.5)").alias("median_profit"))

# Step 4: Estimate empty taxis (drop-offs that occurred in the last 30 minutes with no subsequent pickup)
empty_taxis = profit_df \
    .filter(col("dropoff_datetime") >= (F.lit(latest_time) - F.expr("INTERVAL 30 MINUTES"))) \
    .withColumn("next_pickup", F.lead("pickup_datetime").over(Window.partitionBy("medallion").orderBy("dropoff_datetime"))) \
    .filter(F.col("next_pickup").isNull() | (F.col("next_pickup") > F.col("dropoff_datetime"))) \
    .groupBy("dropoff_grid_250") \
    .agg(F.countDistinct("medallion").alias("empty_taxis"))

# Step 5: Compute profitability (median_profit / empty_taxis)
profitability = median_profit \
    .join(empty_taxis, median_profit.pickup_grid_250 == empty_taxis.dropoff_grid_250, "left_outer") \
    .na.fill({"empty_taxis": 0}) \
    .withColumn("profitability", 
                F.when(col("empty_taxis") > 0, col("median_profit") / col("empty_taxis"))
                 .otherwise(col("median_profit"))) \
    .select(col("pickup_grid_250").alias("cell_id"), "empty_taxis", "median_profit", "profitability")

# Step 6: Rank by profitability and select the top 10 most profitable areas
window_spec_profit = Window.orderBy(F.col("profitability").desc())

top_10_profit_static = profitability \
    .withColumn("rank", rank().over(window_spec_profit)) \
    .filter(F.col("rank") <= 10) \
    .select("cell_id", "empty_taxis", "median_profit", "profitability", "rank")

print("Query 2 Part 1 - Top 10 Profitable Areas (Static):")
top_10_profit_static.show(10)

25/03/30 23:09:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/30 23:09:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/30 23:09:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


Query 2 Part 1 - Top 10 Profitable Areas (Static):


25/03/30 23:09:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/30 23:09:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/30 23:09:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/30 23:09:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.

+-------+-----------+-------------+-------------+----+
|cell_id|empty_taxis|median_profit|profitability|rank|
+-------+-----------+-------------+-------------+----+
+-------+-----------+-------------+-------------+----+



                                                                                