In [1]:
%%configure -f
{
  "conf": {
    "spark.executor.cores": "1",
    "spark.executor.memory": "2g",
    "spark.executor.instances": "4"
  }
}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
597,application_1765289937462_0590,pyspark,idle,Link,Link,,
617,application_1765289937462_0610,pyspark,idle,Link,Link,,
618,application_1765289937462_0611,pyspark,idle,Link,Link,,
620,application_1765289937462_0613,pyspark,idle,Link,Link,,
624,application_1765289937462_0617,pyspark,idle,Link,Link,,
629,application_1765289937462_0622,pyspark,idle,Link,Link,,
632,application_1765289937462_0625,pyspark,idle,Link,Link,,
637,application_1765289937462_0630,pyspark,idle,Link,Link,,
642,application_1765289937462_0635,pyspark,idle,Link,Link,,
649,application_1765289937462_0642,pyspark,idle,Link,Link,,


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, StringType
from pyspark.sql.functions import col, split, explode, trim, desc  # functions used by the query cells
import time

# Obtain the Spark session for this notebook under a recognisable application name.
spark = (
    SparkSession.builder
    .appName("DF query 3 execution")
    .getOrCreate()
)

# Both crime extracts live under the same S3 prefix.
CRIME_DATA_DIR = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data"

# Column names in the order they are declared to the CSV reader.
# "AREA NAME" is spelled with plain ASCII letters: a look-alike spelling using
# Greek capitals cannot be matched by col("AREA NAME") typed on a Latin keyboard.
# LAT / LON are not declared; none of the query cells in this notebook use them,
# and declaring them would also require importing DoubleType.
CRIME_COLUMNS = [
    "DR_NO",
    "Date Rptd",
    "DATE OCC",
    "TIME OCC",
    "AREA",
    "AREA NAME",
    "Rpt Dist No",
    "Part 1-2",
    "Crm Cd",
    "Crm Cd Desc",
    "Mocodes",
    "Vict Age",
    "Vict Sex",
    "Vict Descent",
    "Premis Cd",
    "Premis Desc",
    "Weapon Used Cd",
    "Weapon Desc",
    "Status",
    "Status Desc",
    "Crm Cd 1",
    "Crm Cd 2",
    "Crm Cd 3",
    "Crm Cd 4",
    "LOCATION",
    "Cross Street",
]

# Every column is read as a string except the ones listed here.
INTEGER_COLUMNS = {"Vict Age"}

crime_schema = StructType([
    StructField(name, IntegerType() if name in INTEGER_COLUMNS else StringType())
    for name in CRIME_COLUMNS
])


def read_crime_csv(file_name):
    """Read one crime extract from CRIME_DATA_DIR using the shared schema.

    NOTE(review): header=False treats the first line of the file as data. If
    the extracts start with a header row, that row is loaded as a record (and
    its "Mocodes" cell would later be counted as a code) -- confirm against the
    files and switch to header=True if so.
    """
    return spark.read.csv(
        f"{CRIME_DATA_DIR}/{file_name}",
        header=False,
        schema=crime_schema,
    )


crime_df1 = read_crime_csv("LA_Crime_Data_2010_2019.csv")
crime_df2 = read_crime_csv("LA_Crime_Data_2020_2025.csv")

# Single DataFrame covering both extracts. union() matches columns by position,
# which is safe here because both sides were read with the same schema.
crime_df = crime_df1.union(crime_df2)

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
652,application_1765289937462_0645,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
# Load the MO-code dictionary as plain text; each line is assumed to hold one code.
raw_mo = spark.read.text(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/MO_codes.txt"
)

# Expected line layout: "<code><whitespace><description...>".
# Splitting with a limit of 2 cuts only at the first whitespace run, so the
# description keeps its internal spaces.
code_and_desc = split(col("value"), r"\s+", 2)
mo_dict_df = raw_mo.select(
    trim(code_and_desc.getItem(0)).alias("MO_CODE"),
    trim(code_and_desc.getItem(1)).alias("MO_DESC"),
)

# Turn the whitespace-separated "Mocodes" string of each crime into one row per code.
has_mocodes = col("Mocodes").isNotNull()
mocode_tokens = split(trim(col("Mocodes")), r"\s+")
mocodes_exploded = (
    crime_df
    .filter(has_mocodes)                               # skip crimes without any MO codes
    .select(explode(mocode_tokens).alias("MO_CODE"))
    .filter(col("MO_CODE") != "")                      # discard empty tokens
)

# Number of occurrences of each individual code.
mo_counts = mocodes_exploded.groupBy("MO_CODE").count()

# The timed region covers both show() actions together.
start = time.time()
mo_counts.show(10, truncate=False)
mo_dict_df.show(10, truncate=False)
end = time.time()
print("Execution time:", end - start, "seconds")

# Print the physical plan of each DataFrame.
mo_counts.explain()
mo_dict_df.explain()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+-----+
|MO_CODE|count|
+-------+-----+
|0543   |221  |
|1512   |41   |
|0851   |157  |
|0401   |13049|
|1280   |29   |
|0201   |608  |
|1008   |1071 |
|0371   |15199|
|0385   |41926|
|0908   |1837 |
+-------+-----+
only showing top 10 rows

+-------+-------------------+
|MO_CODE|MO_DESC            |
+-------+-------------------+
|0100   |Suspect Impersonate|
|0101   |Aid victim         |
|0102   |Blind              |
|0103   |Physically disabled|
|0104   |Customer           |
|0105   |Delivery           |
|0106   |Doctor             |
|0107   |God                |
|0108   |Infirm             |
|0109   |Inspector          |
+-------+-------------------+
only showing top 10 rows

Execution time: 13.864091634750366 seconds
== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[MO_CODE#149], functions=[count(1)], schema specialized)
   +- Exchange hashpartitioning(MO_CODE#149, 1000), ENSURE_REQUIREMENTS, [plan_id=173]
      +- HashAggregate(keys=[MO_CODE#14

In [4]:

# Baseline: no join hint, so the planner picks the join strategy on its own.
# The left join keeps every counted code even when the dictionary has no entry for it.
counts_with_desc = mo_counts.join(mo_dict_df, on="MO_CODE", how="left")
result = counts_with_desc.orderBy(desc("count"))

# Time only the action that materialises the ten most frequent codes.
start = time.time()
result.show(10, truncate=False)
end = time.time()
print("Execution time:", end - start, "seconds")

# Print the physical plan of the full sorted join.
result.explain()


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+-------+---------------------------+
|MO_CODE|count  |MO_DESC                    |
+-------+-------+---------------------------+
|0344   |1002900|Removes vict property      |
|1822   |548422 |Stranger                   |
|0416   |404773 |Hit-Hit w/ weapon          |
|0329   |377536 |Vandalized                 |
|0913   |278618 |Victim knew Suspect        |
|2000   |256188 |Domestic violence          |
|1300   |219082 |Vehicle involved           |
|0400   |213165 |Force used                 |
|1402   |177470 |Evidence Booked (any crime)|
|1609   |131229 |Smashed                    |
+-------+-------+---------------------------+
only showing top 10 rows

Execution time: 5.888071298599243 seconds
== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [count#153L DESC NULLS LAST], true, 0
   +- Exchange rangepartitioning(count#153L DESC NULLS LAST, 1000), ENSURE_REQUIREMENTS, [plan_id=469]
      +- Project [MO_CODE#149, count#153L, MO_DESC#144]
         +- BroadcastHashJo

In [5]:
# Join hints are advisory: Spark may ignore them in pathological cases, and AQE
# can override them unless it is disabled.
# Here the small dictionary side is the one marked for broadcasting.
dict_broadcast_hinted = mo_dict_df.hint("broadcast")
counts_with_desc_broadcast = mo_counts.join(
    dict_broadcast_hinted,
    on="MO_CODE",
    how="left",
)
result_broadcast = counts_with_desc_broadcast.orderBy(desc("count"))

# Time only the action that materialises the ten most frequent codes.
start = time.time()
result_broadcast.show(10, truncate=False)
end = time.time()
print("Execution time:", end - start, "seconds")

# Print the physical plan of the full sorted join.
result_broadcast.explain()



FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+-------+---------------------------+
|MO_CODE|count  |MO_DESC                    |
+-------+-------+---------------------------+
|0344   |1002900|Removes vict property      |
|1822   |548422 |Stranger                   |
|0416   |404773 |Hit-Hit w/ weapon          |
|0329   |377536 |Vandalized                 |
|0913   |278618 |Victim knew Suspect        |
|2000   |256188 |Domestic violence          |
|1300   |219082 |Vehicle involved           |
|0400   |213165 |Force used                 |
|1402   |177470 |Evidence Booked (any crime)|
|1609   |131229 |Smashed                    |
+-------+-------+---------------------------+
only showing top 10 rows

Execution time: 2.570488929748535 seconds
== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [count#153L DESC NULLS LAST], true, 0
   +- Exchange rangepartitioning(count#153L DESC NULLS LAST, 1000), ENSURE_REQUIREMENTS, [plan_id=769]
      +- Project [MO_CODE#149, count#153L, MO_DESC#144]
         +- BroadcastHashJo

In [6]:
# Request a sort-merge join by placing the "merge" hint on both join inputs.
counts_merge_hinted = mo_counts.hint("merge")
dict_merge_hinted = mo_dict_df.hint("merge")
result_merge = (
    counts_merge_hinted
    .join(dict_merge_hinted, on="MO_CODE", how="left")
    .orderBy(desc("count"))
)

# Time only the action that materialises the ten most frequent codes.
start = time.time()
result_merge.show(10, truncate=False)
end = time.time()
print("Execution time:", end - start, "seconds")

# Print the physical plan of the full sorted join.
result_merge.explain()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+-------+---------------------------+
|MO_CODE|count  |MO_DESC                    |
+-------+-------+---------------------------+
|0344   |1002900|Removes vict property      |
|1822   |548422 |Stranger                   |
|0416   |404773 |Hit-Hit w/ weapon          |
|0329   |377536 |Vandalized                 |
|0913   |278618 |Victim knew Suspect        |
|2000   |256188 |Domestic violence          |
|1300   |219082 |Vehicle involved           |
|0400   |213165 |Force used                 |
|1402   |177470 |Evidence Booked (any crime)|
|1609   |131229 |Smashed                    |
+-------+-------+---------------------------+
only showing top 10 rows

Execution time: 3.2403318881988525 seconds
== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [count#153L DESC NULLS LAST], true, 0
   +- Exchange rangepartitioning(count#153L DESC NULLS LAST, 1000), ENSURE_REQUIREMENTS, [plan_id=1123]
      +- Project [MO_CODE#149, count#153L, MO_DESC#144]
         +- SortMergeJoin

In [7]:
# Request a shuffled hash join by placing the "shuffle_hash" hint on both join inputs.
counts_shuffle_hash_hinted = mo_counts.hint("shuffle_hash")
dict_shuffle_hash_hinted = mo_dict_df.hint("shuffle_hash")
result_shuffle_hash = (
    counts_shuffle_hash_hinted
    .join(dict_shuffle_hash_hinted, on="MO_CODE", how="left")
    .orderBy(desc("count"))
)

# Time only the action that materialises the ten most frequent codes.
start = time.time()
result_shuffle_hash.show(10, truncate=False)
end = time.time()
print("Execution time:", end - start, "seconds")

# Print the physical plan of the full sorted join.
result_shuffle_hash.explain()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+-------+---------------------------+
|MO_CODE|count  |MO_DESC                    |
+-------+-------+---------------------------+
|0344   |1002900|Removes vict property      |
|1822   |548422 |Stranger                   |
|0416   |404773 |Hit-Hit w/ weapon          |
|0329   |377536 |Vandalized                 |
|0913   |278618 |Victim knew Suspect        |
|2000   |256188 |Domestic violence          |
|1300   |219082 |Vehicle involved           |
|0400   |213165 |Force used                 |
|1402   |177470 |Evidence Booked (any crime)|
|1609   |131229 |Smashed                    |
+-------+-------+---------------------------+
only showing top 10 rows

Execution time: 2.47884464263916 seconds
== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [count#153L DESC NULLS LAST], true, 0
   +- Exchange rangepartitioning(count#153L DESC NULLS LAST, 1000), ENSURE_REQUIREMENTS, [plan_id=1448]
      +- Project [MO_CODE#149, count#153L, MO_DESC#144]
         +- ShuffledHashJoi

In [8]:
# Request a shuffle-and-replicate nested-loop join by hinting both join inputs.
# NOTE(review): the plan captured for this cell shows a broadcast hash join, so
# the hint was not applied here -- presumably because this strategy is only
# available for inner-like joins and this is a left join; confirm against the
# Spark join-hint documentation before comparing this timing with the others.
counts_repl_nl_hinted = mo_counts.hint("shuffle_replicate_nl")
dict_repl_nl_hinted = mo_dict_df.hint("shuffle_replicate_nl")
result_shuffle_repl_nl = (
    counts_repl_nl_hinted
    .join(dict_repl_nl_hinted, on="MO_CODE", how="left")
    .orderBy(desc("count"))
)

# Time only the action that materialises the ten most frequent codes.
start = time.time()
result_shuffle_repl_nl.show(10, truncate=False)
end = time.time()
print("Execution time:", end - start, "seconds")

# Print the physical plan of the full sorted join.
result_shuffle_repl_nl.explain()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+-------+---------------------------+
|MO_CODE|count  |MO_DESC                    |
+-------+-------+---------------------------+
|0344   |1002900|Removes vict property      |
|1822   |548422 |Stranger                   |
|0416   |404773 |Hit-Hit w/ weapon          |
|0329   |377536 |Vandalized                 |
|0913   |278618 |Victim knew Suspect        |
|2000   |256188 |Domestic violence          |
|1300   |219082 |Vehicle involved           |
|0400   |213165 |Force used                 |
|1402   |177470 |Evidence Booked (any crime)|
|1609   |131229 |Smashed                    |
+-------+-------+---------------------------+
only showing top 10 rows

Execution time: 2.155559778213501 seconds
== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [count#153L DESC NULLS LAST], true, 0
   +- Exchange rangepartitioning(count#153L DESC NULLS LAST, 1000), ENSURE_REQUIREMENTS, [plan_id=1749]
      +- Project [MO_CODE#149, count#153L, MO_DESC#144]
         +- BroadcastHashJ