In [1]:
%%configure -f
{
    "conf":{
        "spark.executor.instances": "4",
        "spark.executor.memory": "2g",
        "spark.executor.cores": "1"
    }
}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
550,application_1765289937462_0543,pyspark,idle,Link,Link,,
571,application_1765289937462_0564,pyspark,idle,Link,Link,,
575,application_1765289937462_0568,pyspark,idle,Link,Link,,
576,application_1765289937462_0569,pyspark,idle,Link,Link,,
592,application_1765289937462_0585,pyspark,idle,Link,Link,,
593,application_1765289937462_0586,pyspark,idle,Link,Link,,
597,application_1765289937462_0590,pyspark,idle,Link,Link,,
617,application_1765289937462_0610,pyspark,idle,Link,Link,,


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, StringType
from pyspark.sql.functions import col,when, udf  #import needed functions

import time


# Session handle used by every query in this notebook. getOrCreate() hands back
# the already-active session when there is one instead of building a new one.
spark = (
    SparkSession.builder
    .appName("DF query 1 execution")
    .getOrCreate()
)


# Explicit schema for the LA crime CSV files used by the dataframe queries.
# (The schema could alternatively be inferred.) When a file is read without a
# header, these fields are applied to its columns by position, so the names
# below are the names every later query must use.
# Only "Vict Age" is typed as an integer; all other declared columns stay strings.
crime_schema = StructType([
        StructField("DR_NO", StringType()),
        StructField("Date Rptd", StringType()),
        StructField("DATE OCC", StringType()),
        StructField("TIME OCC", StringType()),
        StructField("AREA", StringType()),
        # Plain ASCII "NAME" (not Greek look-alike capitals), so that
        # col("AREA NAME") typed on a Latin keyboard resolves to this column.
        StructField("AREA NAME", StringType()),
        StructField("Rpt Dist No", StringType()),
        StructField("Part 1-2", StringType()),
        StructField("Crm Cd", StringType()),
        StructField("Crm Cd Desc", StringType()),
        StructField("Mocodes", StringType()),
        StructField("Vict Age", IntegerType()),
        StructField("Vict Sex", StringType()),
        StructField("Vict Descent", StringType()),
        StructField("Premis Cd", StringType()),
        StructField("Premis Desc", StringType()),
        StructField("Weapon Used Cd", StringType()),
        StructField("Weapon Desc", StringType()),
        StructField("Status", StringType()),
        StructField("Status Desc", StringType()),
        StructField("Crm Cd 1", StringType()),
        StructField("Crm Cd 2", StringType()),
        StructField("Crm Cd 3", StringType()),
        StructField("Crm Cd 4", StringType()),
        StructField("LOCATION", StringType()),
        StructField("Cross Street", StringType()),
        # Coordinates are intentionally left out of the schema for this query.
        # DoubleType is not among the imported types; add it to the
        # pyspark.sql.types import before enabling these two fields.
        #StructField("LAT", DoubleType()),
        #StructField("LON", DoubleType()),
    ])

# Load the two exports with the shared explicit schema. With the header option
# off, the very first line of each file is parsed as an ordinary record.
# NOTE(review): if these CSVs start with a column-name line, that line ends up
# in the data as a row — confirm against the files.
crime_df1 = (
    spark.read
    .schema(crime_schema)
    .option("header", False)
    .csv("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2010_2019.csv")
)

crime_df2 = (
    spark.read
    .schema(crime_schema)
    .option("header", False)
    .csv("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv")
)

# Stack both periods into one dataframe covering 2010 onwards. union() pairs
# columns by position, which is safe here because both sides use crime_schema.
crime_df = crime_df1.union(crime_df2)

# Keep only the records whose crime description mentions aggravated assault.
agg_df = crime_df.where(col("Crm Cd Desc").contains("AGGRAVATED ASSAULT"))

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
618,application_1765289937462_0611,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
def age_group(age):
    """Map a victim age to the label of its age bracket.

    Brackets (upper bounds inclusive unless stated otherwise):
        below 18  -> "Children"
        18 to 24  -> "Young Adults"
        25 to 64  -> "Adults"
        above 64  -> "Elderly"

    Args:
        age: numeric age, or None when the age is missing.

    Returns:
        The bracket label as a string, or None when ``age`` is None.

    NOTE(review): every value below 18, including 0 and negative numbers, is
    labelled "Children" — confirm whether the dataset uses such values as
    placeholders for an unknown age.
    """
    if age is None:
        return None

    if age < 18:
        return "Children"

    # Remaining brackets, tried in ascending order of their inclusive upper bound.
    for upper_bound, label in ((24, "Young Adults"), (64, "Adults")):
        if age <= upper_bound:
            return label

    return "Elderly"


# Expose the plain-Python classifier to Spark as a UDF producing a string column.
age_group_udf = udf(f=age_group, returnType=StringType())

# Tag each record of agg_df with its victim age group, count the records per
# group, and list the groups from the most to the least frequent.
# This only builds the query plan; nothing runs until an action is called.
result_udf = (
    agg_df
    .withColumn("age_group", age_group_udf(agg_df["Vict Age"]))
    .groupBy("age_group")
    .count()
    .sort(col("count").desc())
)


# Spark evaluates lazily, so none of the steps defined earlier have run yet:
# show() is the action that executes the whole pipeline (CSV read, filter,
# UDF, aggregation and sort). The interval below is therefore the end-to-end
# time of the query, not just the time to print the result.
# perf_counter() is a monotonic clock meant for measuring elapsed time; unlike
# a difference of time.time() values it is unaffected by wall-clock adjustments.
start = time.perf_counter()
result_udf.show()
end = time.perf_counter()
print("Dataframe with UDF Execution time:", end - start, "seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------+------+
|   age_group| count|
+------------+------+
|      Adults|121660|
|Young Adults| 33758|
|    Children| 16014|
|     Elderly|  6011|
+------------+------+

Dataframe with UDF Execution time: 15.557707071304321 seconds