In [4]:
%%configure -f
{
    "conf": {
        "spark.executor.instances": "4",
        "spark.executor.memory": "4g",
        "spark.executor.cores": "2",
        "spark.driver.memory": "4g"
    }
}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
3071,application_1732639283265_3027,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
2944,application_1732639283265_2903,pyspark,idle,Link,Link,,
2966,application_1732639283265_2924,pyspark,idle,Link,Link,,
2971,application_1732639283265_2929,pyspark,idle,Link,Link,,
2975,application_1732639283265_2933,pyspark,idle,Link,Link,,
2976,application_1732639283265_2934,pyspark,idle,Link,Link,,
3000,application_1732639283265_2958,pyspark,idle,Link,Link,,
3005,application_1732639283265_2963,pyspark,idle,Link,Link,,
3010,application_1732639283265_2968,pyspark,idle,Link,Link,,
3017,application_1732639283265_2975,pyspark,idle,Link,Link,,
3019,application_1732639283265_2977,pyspark,idle,Link,Link,,


In [5]:
# Spark SQL code
from sedona.spark import *
from sedona.sql import *
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, StringType
from pyspark.sql.functions import col, row_number, count, avg
from pyspark.sql.window import Window

# To log our application's execution time:
import time

start_time = time.time()

spark = SparkSession.builder \
    .appName("query5_1") \
    .getOrCreate()

sedona = SedonaContext.create(spark)

police_schema = StructType([
    StructField("X", FloatType(), True),
    StructField("Y", FloatType(), True),
    StructField("FID", IntegerType(), True),
    StructField("DIVISION", StringType(), True),
    StructField("LOCATION", StringType(), True),
    StructField("PREC", IntegerType(), True)
])
# Start timing
start_time = time.time()
police_station_df = spark.read.format('csv') \
                .options(header='true') \
                .schema(police_schema) \
                .load("s3://initial-notebook-data-bucket-dblab-905418150721/LA_Police_Stations.csv")

crimes_schema = StructType([
    StructField("DR_NO", StringType(), True),
    StructField("Date_Rptd", StringType(), True),
    StructField("DATE_OCC", StringType(), True),
    StructField("TIME_OCC", StringType(), True),
    StructField("AREA", StringType(), True),
    StructField("AREA_NAME", StringType(), True),
    StructField("Rpt_Dist_No", StringType(), True),
    StructField("Part_1-2", StringType(), True),
    StructField("Crm_Cd", StringType(), True),
    StructField("Crm_Cd_Desc", StringType(), True),
    StructField("Mocodes", StringType(), True),
    StructField("Vict_Age", StringType(), True),
    StructField("Vict_Sex", StringType(), True),
    StructField("Vict_Descent", StringType(), True),
    StructField("Premis_Cd", StringType(), True),
    StructField("Premis_Desc", StringType(), True),
    StructField("Weapon_Used_Cd", StringType(), True),
    StructField("Weapon_Desc", StringType(), True),
    StructField("Status", StringType(), True),
    StructField("Status_Desc", StringType(), True),
    StructField("Crm_Cd_1", StringType(), True),
    StructField("Crm_Cd_2", StringType(), True),
    StructField("Crm_Cd_3", StringType(), True),
    StructField("Crm_Cd_4", StringType(), True),
    StructField("LOCATION", StringType(), True),
    StructField("Cross_Street", StringType(), True),
    StructField("LAT", StringType(), True),
    StructField("LON", StringType(), True)
])

crimes_df1 = spark.read.format('csv') \
                .options(header='false') \
                .schema(crimes_schema) \
                .load("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv")

crimes_df2 = spark.read.format('csv') \
                .options(header='true') \
                .schema(crimes_schema) \
                .load("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv")

crimes_df = crimes_df1.union(crimes_df2)

# Filter valid coordinates and create geometry column for crimes
crimes_df = crimes_df.filter((col("LAT").isNotNull()) & (col("LON").isNotNull()))
crimes_df = crimes_df.withColumn("geometry", ST_Point(col("LON").cast("double"), col("LAT").cast("double")))

crimes_df.createOrReplaceTempView("crimes")

# Filter valid coordinates and create geometry column for police stations
police_station_df = police_station_df.filter((col("X").isNotNull()) & (col("Y").isNotNull()))
police_station_df = police_station_df.withColumn("geometry", ST_Point(col("X").cast("double"), col("Y").cast("double")))

police_station_df.createOrReplaceTempView("police_stations")

# Compute the distance between crimes and police stations
# using Sedona's ST_Distance function.
from pyspark.sql.functions import avg

distance_df = spark.sql("""
    SELECT 
        p.DIVISION AS division,
        c.DR_NO AS crime_id,
        p.FID AS police_station_id,
        ST_DistanceSphere(c.geometry, p.geometry) AS distance
    FROM 
        crimes c, 
        police_stations p
""")

# Filtering the NULL distances
distance_df = distance_df.filter(col("distance").isNotNull())

# Identify the nearest police station for each crime
# row_number: Assigns a unique rank to each police station for each crime based on the distance.
# The nearest station gets rank 1.

window = Window.partitionBy("crime_id").orderBy("distance")
nearest_ps_df = distance_df.withColumn("row_num", row_number().over(window)) \
    .filter(col("row_num") == 1) \
    .drop("row_num")


# Group by police station and count the number of nearest crimes
crime_count_df = nearest_ps_df.groupBy("police_station_id").agg(
    count("crime_id").alias("crime_count")
)

# Calculate the average distance for each police station
average_distance_df = nearest_ps_df.groupBy("police_station_id", "division").agg(
    avg("distance").alias("average_distance")
)


final_df = crime_count_df.join(
    average_distance_df,
    on="police_station_id",
    how="inner"
)

final_df.select(
    col("division"),
    col("crime_count"),
    col("average_distance")).orderBy(col("crime_count").desc()).show(truncate=False)

# Stop timing and print out the execution duration
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------+-----------+------------------+
|division        |crime_count|average_distance  |
+----------------+-----------+------------------+
|HOLLYWOOD       |224340     |2076.23809789341  |
|VAN NUYS        |210134     |2953.338845065694 |
|SOUTHWEST       |188901     |2191.340656892832 |
|WILSHIRE        |185996     |2592.745867768168 |
|77TH STREET     |171827     |1716.5037087811047|
|OLYMPIC         |170824     |1723.0799804793512|
|NORTH HOLLYWOOD |167854     |2642.9766238125007|
|PACIFIC         |161359     |3850.0806994385694|
|CENTRAL         |153871     |992.5207642454992 |
|RAMPART         |152809     |1535.1721490412608|
|SOUTHEAST       |152176     |2421.787272301805 |
|WEST VALLEY     |138643     |3035.680867104131 |
|TOPANGA         |138217     |3297.024719991776 |
|FOOTHILL        |134896     |4250.907375925852 |
|HARBOR          |126747     |3702.5396866918204|
|HOLLENBECK      |119294     |366921.2363563665 |
|WEST LOS ANGELES|115781     |2792.4871776997666|
