In [1]:
%%configure -f
{
    "conf":{
        "spark.executor.instances": "4",
        "spark.executor.memory": "2g",
        "spark.executor.cores": "1"
    }
}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
10,application_1765289937462_0011,pyspark,idle,Link,Link,,
11,application_1765289937462_0012,pyspark,idle,Link,Link,,
15,application_1765289937462_0016,pyspark,idle,Link,Link,,
16,application_1765289937462_0017,pyspark,idle,Link,Link,,
17,application_1765289937462_0018,pyspark,idle,Link,Link,,
18,application_1765289937462_0019,pyspark,idle,Link,Link,,
20,application_1765289937462_0021,pyspark,idle,Link,Link,,
22,application_1765289937462_0023,pyspark,idle,Link,Link,,
23,,pyspark,starting,,,,


In [2]:
from pyspark.sql import SparkSession

from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, StringType
from pyspark.sql.functions import col  #import needed functions

# Reuse the active Spark session if there is one, otherwise create it.
spark = (
    SparkSession.builder
    .appName("DF query 3 execution")
    .getOrCreate()
)

# Explicit schema for the LA crime CSV exports: every column is read as a
# string except the victim age, which is parsed as an integer.
crime_schema = StructType([
    StructField("DR_NO", StringType()),
    StructField("Date Rptd", StringType()),
    StructField("DATE OCC", StringType()),
    StructField("TIME OCC", StringType()),
    StructField("AREA", StringType()),
    # Previously spelled with Greek look-alike capitals; now plain ASCII so
    # the column can be referenced by typing its name.
    StructField("AREA NAME", StringType()),
    StructField("Rpt Dist No", StringType()),
    StructField("Part 1-2", StringType()),
    StructField("Crm Cd", StringType()),
    StructField("Crm Cd Desc", StringType()),
    StructField("Mocodes", StringType()),
    StructField("Vict Age", IntegerType()),
    StructField("Vict Sex", StringType()),
    StructField("Vict Descent", StringType()),
    StructField("Premis Cd", StringType()),
    StructField("Premis Desc", StringType()),
    StructField("Weapon Used Cd", StringType()),
    StructField("Weapon Desc", StringType()),
    StructField("Status", StringType()),
    StructField("Status Desc", StringType()),
    StructField("Crm Cd 1", StringType()),
    StructField("Crm Cd 2", StringType()),
    StructField("Crm Cd 3", StringType()),
    StructField("Crm Cd 4", StringType()),
    StructField("LOCATION", StringType()),
    StructField("Cross Street", StringType()),
    # LAT and LON are not declared in this schema.
])

# NOTE(review): header=True assumes each CSV starts with a header line (the
# schema field names look like header labels) - confirm against the files.
# With header=False that line would be loaded as an ordinary data row and its
# "Mocodes" cell would be counted as if it were an MO code.
crime_df1 = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv",
    header=True,
    schema=crime_schema,
)

crime_df2 = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv",
    header=True,
    schema=crime_schema,
)

# Single DataFrame covering both files (2010-2019 and 2020-2025).
# union() matches columns by position; both inputs share crime_schema.
crime_df = crime_df1.union(crime_df2)

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
24,application_1765289937462_0025,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
from pyspark.sql.functions import col, split, explode, trim, desc

# MO code dictionary: plain text file, read as one row per line in column "value".
raw_mo = spark.read.text(
    "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/MO_codes.txt"
)

# Each line is assumed to be "<code><whitespace><description...>".
# The line is trimmed first so leading spaces cannot yield an empty code, then
# split once (limit=2) at the first whitespace run so the description keeps
# its internal spaces. The expression is built once and reused for both columns.
mo_parts = split(trim(col("value")), r"\s+", 2)

mo_dict_df = (
    raw_mo
    .select(
        mo_parts.getItem(0).alias("MO_CODE"),
        mo_parts.getItem(1).alias("MO_DESC"),  # null when a line has no description
    )
    # Blank lines produce an empty code; they can never match a real MO code.
    .filter(col("MO_CODE") != "")
)

mo_dict_df.show(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+-------------------+
|MO_CODE|            MO_DESC|
+-------+-------------------+
|   0100|Suspect Impersonate|
|   0101|         Aid victim|
|   0102|              Blind|
|   0103|Physically disabled|
|   0104|           Customer|
|   0105|           Delivery|
|   0106|             Doctor|
|   0107|                God|
|   0108|             Infirm|
|   0109|          Inspector|
+-------+-------------------+
only showing top 10 rows

In [15]:
# Optional but useful for determinism: turn off AQE, otherwise Spark may change strategy at runtime
# spark.conf.set("spark.sql.adaptive.enabled", "false")

import time

# One row per individual MO code: keep rows with a non-null "Mocodes" value,
# split it on runs of whitespace and explode the resulting array.
mocodes_exploded = (
    crime_df
    .filter(col("Mocodes").isNotNull())
    .select(
        explode(
            split(trim(col("Mocodes")), r"\s+")  # split on 1+ whitespace characters
        ).alias("MO_CODE")
    )
    .filter(col("MO_CODE") != "")  # drop empty pieces
)

# Number of occurrences of each MO code.
# NOTE(review): mo_counts is not cached, so every action that uses it re-runs
# the scan and aggregation; timings of the join cells include that work.
mo_counts = (
    mocodes_exploded
    .groupBy("MO_CODE")
    .count()
)

# Baseline run: no join hint, Spark chooses the join strategy itself.
start = time.perf_counter()  # monotonic clock, intended for measuring intervals

result = (
    mo_counts
    .join(mo_dict_df, on="MO_CODE", how="left")  # left join keeps codes without a description
    .orderBy(desc("count"))
)

result.show(10, truncate=False)  # the action that actually executes the query
end = time.perf_counter()

# Printing the plan is kept out of the timed region so only the query is measured.
result.explain()

print("Execution time:", end - start, "seconds")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+-------+---------------------------+
|MO_CODE|count  |MO_DESC                    |
+-------+-------+---------------------------+
|0344   |1002900|Removes vict property      |
|1822   |548422 |Stranger                   |
|0416   |404773 |Hit-Hit w/ weapon          |
|0329   |377536 |Vandalized                 |
|0913   |278618 |Victim knew Suspect        |
|2000   |256188 |Domestic violence          |
|1300   |219082 |Vehicle involved           |
|0400   |213165 |Force used                 |
|1402   |177470 |Evidence Booked (any crime)|
|1609   |131229 |Smashed                    |
+-------+-------+---------------------------+
only showing top 10 rows

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [count#985L DESC NULLS LAST], true, 0
   +- Exchange rangepartitioning(count#985L DESC NULLS LAST, 1000), ENSURE_REQUIREMENTS, [plan_id=3632]
      +- Project [MO_CODE#981, count#985L, MO_DESC#144]
         +- BroadcastHashJoin [MO_CODE#981], [MO_CODE#143], LeftOute

In [11]:
# Note that hints are advisory (Spark may ignore them in pathological cases) and AQE can override them unless disabled.

# Same query as the baseline, asking Spark to broadcast the MO dictionary side.
start = time.perf_counter()  # monotonic clock, intended for measuring intervals

result_broadcast = (
    mo_counts
    .join(
        mo_dict_df.hint("broadcast"),   # or .hint("BROADCAST")
        on="MO_CODE",
        how="left"
    )
    .orderBy(desc("count"))
)

result_broadcast.show(10, truncate=False)  # the action that actually executes the query
end = time.perf_counter()

# Printing the plan is kept out of the timed region so only the query is measured.
result_broadcast.explain()

print("Execution time:", end - start, "seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+-------+---------------------------+
|MO_CODE|count  |MO_DESC                    |
+-------+-------+---------------------------+
|0344   |1002900|Removes vict property      |
|1822   |548422 |Stranger                   |
|0416   |404773 |Hit-Hit w/ weapon          |
|0329   |377536 |Vandalized                 |
|0913   |278618 |Victim knew Suspect        |
|2000   |256188 |Domestic violence          |
|1300   |219082 |Vehicle involved           |
|0400   |213165 |Force used                 |
|1402   |177470 |Evidence Booked (any crime)|
|1609   |131229 |Smashed                    |
+-------+-------+---------------------------+
only showing top 10 rows

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [count#615L DESC NULLS LAST], true, 0
   +- Exchange rangepartitioning(count#615L DESC NULLS LAST, 1000), ENSURE_REQUIREMENTS, [plan_id=2352]
      +- Project [MO_CODE#611, count#615L, MO_DESC#144]
         +- BroadcastHashJoin [MO_CODE#611], [MO_CODE#143], LeftOute

In [12]:
# Same query as the baseline, requesting a sort-merge join via the "merge" hint
# on both join inputs.
start = time.perf_counter()  # monotonic clock, intended for measuring intervals

result_merge = (
    mo_counts
    .hint("merge")
    .join(
        mo_dict_df.hint("merge"),
        on="MO_CODE",
        how="left"
    )
    .orderBy(desc("count"))
)

result_merge.show(10, truncate=False)  # the action that actually executes the query
end = time.perf_counter()

# Printing the plan is kept out of the timed region so only the query is measured.
result_merge.explain()

print("Execution time:", end - start, "seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+-------+---------------------------+
|MO_CODE|count  |MO_DESC                    |
+-------+-------+---------------------------+
|0344   |1002900|Removes vict property      |
|1822   |548422 |Stranger                   |
|0416   |404773 |Hit-Hit w/ weapon          |
|0329   |377536 |Vandalized                 |
|0913   |278618 |Victim knew Suspect        |
|2000   |256188 |Domestic violence          |
|1300   |219082 |Vehicle involved           |
|0400   |213165 |Force used                 |
|1402   |177470 |Evidence Booked (any crime)|
|1609   |131229 |Smashed                    |
+-------+-------+---------------------------+
only showing top 10 rows

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [count#615L DESC NULLS LAST], true, 0
   +- Exchange rangepartitioning(count#615L DESC NULLS LAST, 1000), ENSURE_REQUIREMENTS, [plan_id=2706]
      +- Project [MO_CODE#611, count#615L, MO_DESC#144]
         +- SortMergeJoin [MO_CODE#611], [MO_CODE#143], LeftOuter
  

In [13]:
# Same query as the baseline, requesting a shuffled hash join via the
# "shuffle_hash" hint on both join inputs.
start = time.perf_counter()  # monotonic clock, intended for measuring intervals

result_shuffle_hash = (
    mo_counts
    .hint("shuffle_hash")
    .join(
        mo_dict_df.hint("shuffle_hash"),
        on="MO_CODE",
        how="left"
    )
    .orderBy(desc("count"))
)

result_shuffle_hash.show(10, truncate=False)  # the action that actually executes the query
end = time.perf_counter()

# Printing the plan is kept out of the timed region so only the query is measured.
result_shuffle_hash.explain()

print("Execution time:", end - start, "seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+-------+---------------------------+
|MO_CODE|count  |MO_DESC                    |
+-------+-------+---------------------------+
|0344   |1002900|Removes vict property      |
|1822   |548422 |Stranger                   |
|0416   |404773 |Hit-Hit w/ weapon          |
|0329   |377536 |Vandalized                 |
|0913   |278618 |Victim knew Suspect        |
|2000   |256188 |Domestic violence          |
|1300   |219082 |Vehicle involved           |
|0400   |213165 |Force used                 |
|1402   |177470 |Evidence Booked (any crime)|
|1609   |131229 |Smashed                    |
+-------+-------+---------------------------+
only showing top 10 rows

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [count#615L DESC NULLS LAST], true, 0
   +- Exchange rangepartitioning(count#615L DESC NULLS LAST, 1000), ENSURE_REQUIREMENTS, [plan_id=3031]
      +- Project [MO_CODE#611, count#615L, MO_DESC#144]
         +- ShuffledHashJoin [MO_CODE#611], [MO_CODE#143], LeftOuter

In [14]:
# Same query as the baseline, with the "shuffle_replicate_nl" hint on both
# join inputs.
# NOTE(review): the plan captured for this cell shows BroadcastHashJoin, i.e.
# the hint was not honoured. Shuffle-and-replicate nested loop join appears to
# be available only for inner-like joins - confirm against the Spark join
# hints documentation. The left join is kept so the result stays comparable
# with the other strategy cells.
start = time.perf_counter()  # monotonic clock, intended for measuring intervals

result_shuffle_repl_nl = (
    mo_counts
    .hint("shuffle_replicate_nl")
    .join(
        mo_dict_df.hint("shuffle_replicate_nl"),
        on="MO_CODE",
        how="left"
    )
    .orderBy(desc("count"))
)

result_shuffle_repl_nl.show(10, truncate=False)  # the action that actually executes the query
end = time.perf_counter()

# Printing the plan is kept out of the timed region so only the query is measured.
result_shuffle_repl_nl.explain()

print("Execution time:", end - start, "seconds")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+-------+---------------------------+
|MO_CODE|count  |MO_DESC                    |
+-------+-------+---------------------------+
|0344   |1002900|Removes vict property      |
|1822   |548422 |Stranger                   |
|0416   |404773 |Hit-Hit w/ weapon          |
|0329   |377536 |Vandalized                 |
|0913   |278618 |Victim knew Suspect        |
|2000   |256188 |Domestic violence          |
|1300   |219082 |Vehicle involved           |
|0400   |213165 |Force used                 |
|1402   |177470 |Evidence Booked (any crime)|
|1609   |131229 |Smashed                    |
+-------+-------+---------------------------+
only showing top 10 rows

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [count#615L DESC NULLS LAST], true, 0
   +- Exchange rangepartitioning(count#615L DESC NULLS LAST, 1000), ENSURE_REQUIREMENTS, [plan_id=3332]
      +- Project [MO_CODE#611, count#615L, MO_DESC#144]
         +- BroadcastHashJoin [MO_CODE#611], [MO_CODE#143], LeftOute