In [1]:
%%configure -f
{
    "conf":{
        "spark.executor.instances": "4",
        "spark.executor.memory": "4g",
        "spark.executor.cores": "2"
    }
}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
550,application_1765289937462_0543,pyspark,idle,Link,Link,,
551,application_1765289937462_0544,pyspark,idle,Link,Link,,
558,application_1765289937462_0551,pyspark,idle,Link,Link,,
568,application_1765289937462_0561,pyspark,idle,Link,Link,,
570,application_1765289937462_0563,pyspark,idle,Link,Link,,
571,application_1765289937462_0564,pyspark,idle,Link,Link,,
572,application_1765289937462_0565,pyspark,idle,Link,Link,,
573,application_1765289937462_0566,pyspark,idle,Link,Link,,
574,application_1765289937462_0567,pyspark,idle,Link,Link,,


In [2]:
from sedona.spark import *
from pyspark.sql import SparkSession


from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, StringType
from pyspark.sql.functions import udf, year, avg, count, concat, lit, round, rank, col, regexp_replace, substring, corr

spark = SparkSession \
    .builder \
    .appName("query 4 execution") \
    .getOrCreate()

# Create sedona context
sedona = SedonaContext.create(spark)


crime_df = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Crime_Data/LA_Crime_Data_2020_2025.csv", header = True, inferSchema = True)
# crime

geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_Census_Blocks_2020.geojson"
blocks_df = sedona.read.format("geojson") \
            .option("multiLine", "true").load(geojson_path) \
            .selectExpr("explode(features) as features") \
            .select("features.*")
# Formatting magic
flattened_df = blocks_df.select( \
                [col(f"properties.{col_name}").alias(col_name) for col_name in \
                blocks_df.schema["properties"].dataType.fieldNames()] + ["geometry"]) \
            .drop("properties") \
            .drop("type") 
# census

median_household_income= "s3://initial-notebook-data-bucket-dblab-905418150721/project_data/LA_income_2021.csv"
# median_df = spark.read.csv(median_household_income, header = True, inferSchema = True)
median_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .option("delimiter", ";") 
    .csv(median_household_income)
)
#median_household_income

# Print schema
flattened_df.printSchema()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
575,application_1765289937462_0568,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- BG20: string (nullable = true)
 |-- BG20FIP_CURRENT: string (nullable = true)
 |-- BGFIP20: string (nullable = true)
 |-- CB20: string (nullable = true)
 |-- CITY: string (nullable = true)
 |-- CITYCOMM: string (nullable = true)
 |-- CITYCOMM_CURRENT: string (nullable = true)
 |-- CITY_CURRENT: string (nullable = true)
 |-- COMM: string (nullable = true)
 |-- COMM_CURRENT: string (nullable = true)
 |-- COUNTY: string (nullable = true)
 |-- CT20: string (nullable = true)
 |-- CTCB20: string (nullable = true)
 |-- FEAT_TYPE: string (nullable = true)
 |-- FIP20: string (nullable = true)
 |-- FIP_CURRENT: string (nullable = true)
 |-- HD22: long (nullable = true)
 |-- HD_NAME: string (nullable = true)
 |-- HOUSING20: long (nullable = true)
 |-- OBJECTID: long (nullable = true)
 |-- POP20: long (nullable = true)
 |-- SPA22: long (nullable = true)
 |-- SPA_NAME: string (nullable = true)
 |-- SUP21: string (nullable = true)
 |-- SUP_LABEL: string (nullable = true)
 |-- ShapeSTArea: 

In [3]:
# last year
from pyspark.sql.functions import sum as _sum

#select only desired columns from 2020_Census_Blocks.geojson, filter data only from LA, keep only valid data and then do summations for Housing and Population and create geometry
group_flattened = (
    flattened_df
    .select("COMM", "POP20", "ZCTA20", "HOUSING20" , "geometry")
    .filter(
        (col("ZCTA20") > 0) &
        (col("HOUSING20") > 0) &
        (col("POP20") > 0) &
        (col("COMM") != "")
    )
    .groupBy("COMM", "ZCTA20")
    .agg(
        _sum("POP20").alias("Total_POP"),
        _sum("HOUSING20").alias("Total_Housing"),
        ST_Union_Aggr("geometry").alias("geometry")
    )
)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
#join Cencus with median house income. Join key is Zip Code. Result is GDP Per Capita in every area.
joined = (
    group_flattened
    .join(median_df, group_flattened["ZCTA20"] == median_df["Zip Code"])
    .withColumn(
        "Estimated Median Income", 
        regexp_replace(col("Estimated Median Income"), "[^0-9]", "")
    )
    .withColumn(
        "ZIP_Total_Income", 
        (col("Estimated Median Income") * col("Total_Housing"))
    )
    .groupBy("COMM")
    .agg(
        _sum("Total_POP").alias("Total_COMM_Pop"),
        _sum("ZIP_Total_Income").alias("COMM_Total_Income"),
        ST_Union_Aggr("geometry").alias("geometry")
    )
    .withColumn(
        "GDP_Per_Capita",
        (col("COMM_Total_Income")/col("Total_COMM_Pop"))
    )
    .select("COMM", "GDP_Per_Capita", "geometry") 
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
#keep crime data only from 2020 and 2021. Also create a geometry column according to longtitude and latitude columns.
crime_data_20_21_df = (
    crime_df
    .filter(
        (substring(col("DATE OCC"), 1, 4) == "2020") |
        (substring(col("DATE OCC"), 1, 4) == "2021") 
    )
    .select("DATE OCC", "LAT", "LON")
    .withColumn("geom", ST_Point("LON", "LAT"))
    .drop("LAT", "LON")
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
#final_top_3_df.show()

final_crimes_df = (
    joined
    .join(crime_data_20_21_df, ST_Within(crime_data_20_21_df.geom, joined.geometry), "inner")
    .groupBy("COMM")
    .agg(
        count("*").alias("#")
    )
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
#join Cencus with crimes per COMM. Join key is Zip Code. Result is crimes per capita in every area.
crime_joined = (
    group_flattened
    .join(final_crimes_df, on="COMM", how="inner")
    .withColumn(
        "Crimes_Per_Capita",
        (col("#")/col("Total_Pop"))
    )
    .select("COMM", "Crimes_Per_Capita") #geometry 
)

top_10 = joined.orderBy(col("GDP_Per_Capita"), ascending=False).limit(10)

bottom_10 = joined.orderBy(col("GDP_Per_Capita"), ascending=True).limit(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
from pyspark.sql.functions import corr
import time

start = time.time()

#Corelation while using all areas
corr_joined = (
    joined
    .join(crime_joined, on="COMM", how="inner")
    .select("COMM", "GDP_Per_Capita", "Crimes_Per_Capita") 
)

full_corr = corr_joined.select(
    corr("GDP_Per_Capita", "Crimes_Per_Capita")
).collect()[0][0]

print("Correlation =", full_corr)

#Corelation while using top10 income areas
corr_top10 = (
    top_10
    .join(crime_joined, on="COMM", how="inner") # Broadcast Hash Join
    .select("COMM", "GDP_Per_Capita", "Crimes_Per_Capita") 
)

full_corr_top10 = corr_top10.select(
    corr("GDP_Per_Capita", "Crimes_Per_Capita")
).collect()[0][0]

print("Top 10 Correlation =", full_corr_top10)

#Corelation while using the bottom 10 areas
corr_bottom10 = (
    bottom_10
    .join(crime_joined, on="COMM", how="inner") # Broadcast Hash Join
    .select("COMM", "GDP_Per_Capita", "Crimes_Per_Capita") 
)

full_corr_bot10 = corr_bottom10.select(
    corr("GDP_Per_Capita", "Crimes_Per_Capita")
).collect()[0][0]

print("Bottom 10 Correlation =", full_corr_bot10)
end = time.time()
print("Execution time:", end - start, "seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Correlation = 0.05699006354516789
Top 10 Correlation = 0.2930302995361155
Bottom 10 Correlation = 0.17968480714580115
Execution time: 63.09091138839722 seconds

In [9]:
corr_joined.explain()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [COMM#124, GDP_Per_Capita#394, Crimes_Per_Capita#497]
   +- SortMergeJoin [COMM#124], [COMM#522], Inner
      :- Sort [COMM#124 ASC NULLS FIRST], false, 0
      :  +- HashAggregate(keys=[COMM#124], functions=[sum(ZIP_Total_Income#350), sum(Total_POP#299L)], schema specialized)
      :     +- Exchange hashpartitioning(COMM#124, 1000), ENSURE_REQUIREMENTS, [plan_id=5885]
      :        +- HashAggregate(keys=[COMM#124], functions=[partial_sum(ZIP_Total_Income#350), partial_sum(Total_POP#299L)], schema specialized)
      :           +- Project [COMM#124, Total_POP#299L, (cast(regexp_replace(Estimated Median Income#284, [^0-9], , 1) as double) * cast(Total_Housing#301L as double)) AS ZIP_Total_Income#350]
      :              +- BroadcastHashJoin [cast(ZCTA20#144 as int)], [Zip Code#282], Inner, BuildRight, false
      :                 :- HashAggregate(keys=[COMM#124, ZCTA20#144], functions=[sum(POP20#136L), sum(HOUSING20#1

In [10]:
corr_top10.explain()
corr_bottom10.explain()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [COMM#124, GDP_Per_Capita#394, Crimes_Per_Capita#497]
   +- BroadcastHashJoin [COMM#124], [COMM#991], Inner, BuildLeft, false
      :- BroadcastExchange HashedRelationBroadcastMode(List(input[0, string, true]),false), [plan_id=6346]
      :  +- TakeOrderedAndProject(limit=10, orderBy=[GDP_Per_Capita#394 DESC NULLS LAST], output=[COMM#124,GDP_Per_Capita#394])
      :     +- HashAggregate(keys=[COMM#124], functions=[sum(ZIP_Total_Income#350), sum(Total_POP#299L)], schema specialized)
      :        +- Exchange hashpartitioning(COMM#124, 1000), ENSURE_REQUIREMENTS, [plan_id=6319]
      :           +- HashAggregate(keys=[COMM#124], functions=[partial_sum(ZIP_Total_Income#350), partial_sum(Total_POP#299L)], schema specialized)
      :              +- Project [COMM#124, Total_POP#299L, (cast(regexp_replace(Estimated Median Income#284, [^0-9], , 1) as double) * cast(Total_Housing#301L as double)) AS ZIP_Total_Income#350]
     