In [1]:
%%configure -f
{
    "conf":{
        "spark.executor.instances": "4",
        "spark.executor.memory": "4g",
        "spark.executor.cores": "2"
    }
}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
175,application_1765289937462_0172,pyspark,busy,Link,Link,,
178,application_1765289937462_0175,pyspark,idle,Link,Link,,
180,application_1765289937462_0177,pyspark,idle,Link,Link,,
181,application_1765289937462_0178,pyspark,idle,Link,Link,,
182,application_1765289937462_0179,pyspark,idle,Link,Link,,
184,application_1765289937462_0181,pyspark,idle,Link,Link,,
185,application_1765289937462_0182,pyspark,idle,Link,Link,,
186,application_1765289937462_0183,pyspark,idle,Link,Link,,
187,application_1765289937462_0184,pyspark,idle,Link,Link,,
188,application_1765289937462_0185,pyspark,busy,Link,Link,,


In [2]:
from sedona.spark import *
from pyspark.sql import SparkSession


from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, StringType
from pyspark.sql.functions import udf, year, avg, count, concat, lit, round, rank, col, row_number

# Obtain the Spark session for this notebook (an already-running session is
# reused by getOrCreate) and label the application for the Spark UI / YARN.
spark = (
    SparkSession.builder
    .appName("query 4 execution")
    .getOrCreate()
)

# Build the Sedona context on top of the Spark session; the ST_* spatial
# functions used later in the notebook come from the sedona.spark star import.
sedona = SedonaContext.create(spark)

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
189,application_1765289937462_0186,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
# Both crime extracts are read with the same CSV settings: first line is the
# header and column types are inferred by Spark.
crime_csv_options = {"header": True, "inferSchema": True}

crime_df1 = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv",
    **crime_csv_options,
)
crime_df2 = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv",
    **crime_csv_options,
)

# union() lines columns up by position (not by name), so this relies on the
# two files sharing the same column order.
crime_df = crime_df1.union(crime_df2)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# Police station reference data: one row per station with its coordinates
# (X, Y) and DIVISION name, as shown by the sample row printed below.
stations_df = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Police_Stations.csv",
    header=True,
    inferSchema=True,
)

# Peek at the first record to confirm the inferred column names and types.
stations_df.head()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Row(X=-118.289241553, Y=33.7576608970001, FID=1, DIVISION='HARBOR', LOCATION='2175 JOHN S. GIBSON BLVD.', PREC=5)

In [5]:
from pyspark.sql.window import Window
import time

start = time.time()

# Ranking window for "nearest station per crime": rows that share a DR_NO are
# ordered by their computed distance, closest first.
window = Window.partitionBy("DR_NO").orderBy("distance")

# A crime is usable only if both coordinates are present and it is not parked
# at (0, 0) -- "Null Island", the placeholder for an unknown location.
has_coordinates = col("LAT").isNotNull() & col("LON").isNotNull()
at_null_island = (col("LAT") == 0) & (col("LON") == 0)

# Only the crime id and its coordinates are needed for the distance query.
filtered_crimes_df = (
    crime_df
    .filter(has_coordinates & ~at_null_island)
    .select("DR_NO", "LAT", "LON")
)
def get_distance_udf(lat1_col, lon1_col, lat2_col, lon2_col):
    """Build a column expression for the spherical distance between two points.

    Despite the name, this is not a Python UDF: it only composes Sedona
    column functions, so the result is an ordinary Spark column expression.

    Args:
        lat1_col: Column holding the latitude of the first point.
        lon1_col: Column holding the longitude of the first point.
        lat2_col: Column holding the latitude of the second point.
        lon2_col: Column holding the longitude of the second point.

    Returns:
        The ST_DistanceSphere result divided by 1000.0 (kilometres).
    """
    def _as_point(lat, lon):
        # Sedona ST_Point takes (x, y), i.e. (longitude, latitude).
        return ST_Point(lon.cast("double"), lat.cast("double"))

    origin = _as_point(lat1_col, lon1_col)
    target = _as_point(lat2_col, lon2_col)
    sphere_distance = ST_DistanceSphere(origin, target)
    return sphere_distance / 1000.0  # km

# Stop the timer started at the top of this cell. Everything above only builds
# lazy DataFrame expressions, so this measures plan construction, not the query.
end = time.time()
elapsed_seconds = end - start
print("Execution time:", elapsed_seconds, "seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Execution time: 0.05773162841796875 seconds

In [8]:
start = time.time()

# 2. Pair every crime with every station and compute the crime-to-station
#    distance (km). In the stations file Y is the latitude and X the longitude.
#    The station name column is literally "DIVISION"; it is selected by its
#    real name and aliased, so the lookup does not depend on Spark's
#    case-insensitive column resolution.
station_coords_df = stations_df.select(
    col("DIVISION").alias("division"),
    col("Y"),
    col("X"),
)

crimes_stations_dist_df = (
    filtered_crimes_df
    .crossJoin(station_coords_df)  # crimes x police stations cartesian product
    .withColumn(
        "distance",
        get_distance_udf(col("LAT"), col("LON"), col("Y"), col("X"))
    )
)


# 3. Keep exactly one nearest station per crime.
#    row_number() (unlike rank()) never assigns position 1 to more than one
#    row, so a crime that is equidistant to several stations is counted once
#    instead of once per tied station. The secondary sort on "division" makes
#    the winner of such a tie deterministic across runs.
nearest_station_window = (
    Window
    .partitionBy("DR_NO")
    .orderBy(col("distance").asc(), col("division").asc())
)

nearest_station_df = (
    crimes_stations_dist_df
    .withColumn("distance_rank", row_number().over(nearest_station_window))
    .filter(col("distance_rank") == 1)
    .select("division", "distance")
)

# 4. Aggregate per division: crime count + average distance (rounded to 3
#    decimals), most crimes first.
division_stats_df = (
    nearest_station_df
    .groupBy("division")
    .agg(
        avg("distance").alias("average_distance_km"),
        count("*").alias("#")
    )
    .withColumn(
        "average_distance_km",
        round(col("average_distance_km"), 3)
    )
    .orderBy(col("#").desc())
)


# show() is the action that actually runs the job, so the timing below covers
# the full cross join, ranking and aggregation.
division_stats_df.show(truncate=False)

end = time.time()
print("Execution time:", end - start, "seconds")

# # Save output
# division_stats_df.write.csv(
#     "./output/query4/query4-DataFrame.csv",
#     header=True,
#     mode="overwrite"
# )

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------+-------------------+------+
|division        |average_distance_km|#     |
+----------------+-------------------+------+
|HOLLYWOOD       |2.077              |225515|
|VAN NUYS        |2.953              |211130|
|SOUTHWEST       |2.191              |189565|
|WILSHIRE        |2.593              |187061|
|77TH STREET     |1.717              |172558|
|OLYMPIC         |1.725              |172353|
|NORTH HOLLYWOOD |2.643              |168655|
|PACIFIC         |3.853              |162514|
|CENTRAL         |0.993              |154952|
|SOUTHEAST       |2.422              |153746|
|RAMPART         |1.535              |153690|
|TOPANGA         |3.298              |141070|
|WEST VALLEY     |3.039              |139820|
|FOOTHILL        |4.251              |135381|
|HARBOR          |3.702              |127370|
|HOLLENBECK      |2.677              |116558|
|WEST LOS ANGELES|2.79               |116308|
|NEWTON          |1.635              |111628|
|NORTHEAST       |3.623           