In [1]:
%%configure -f
{
    "conf": {
        "spark.executor.instances": "2",
        "spark.executor.memory": "8g",
        "spark.executor.cores": "4",
        "spark.driver.memory": "8g"
    }
}


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
3283,application_1732639283265_3239,pyspark,idle,Link,Link,,
3292,application_1732639283265_3248,pyspark,idle,Link,Link,,
3297,application_1732639283265_3253,pyspark,idle,Link,Link,,
3307,application_1732639283265_3263,pyspark,idle,Link,Link,,
3349,application_1732639283265_3305,pyspark,idle,Link,Link,,
3399,application_1732639283265_3355,pyspark,idle,Link,Link,,
3417,application_1732639283265_3373,pyspark,idle,Link,Link,,
3419,application_1732639283265_3375,pyspark,idle,Link,Link,,
3420,application_1732639283265_3376,pyspark,idle,Link,Link,,
3421,application_1732639283265_3377,pyspark,idle,Link,Link,,


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from sedona.spark import *
from sedona.sql import *
import time

# Obtain the Spark session for this notebook under the application name
# "query4" (getOrCreate hands back an already-running session when one exists).
spark = (
    SparkSession.builder
    .appName("query4")
    .getOrCreate()
)

# Wall-clock reference point; the elapsed time is printed at the end of the cell.
start_time = time.time()


# Schema for the 2015 income CSV: all three fields are kept as nullable
# strings at load time; the income text is converted to a number later on.
income_schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in ("ZIP", "Comm", "income")
])

# Load the income file with the explicit schema, treating its first line as a header.
income_df = (
    spark.read.format('csv')
    .schema(income_schema)
    .options(header='true')
    .load("s3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv")
)

# Wrap the session in a Sedona context; the GeoJSON reader below goes through it.
sedona = SedonaContext.create(spark)

# Define crime data schema.
# Every column is loaded as a nullable string, in file order; the few columns
# that need a numeric or date type are cast at the point where they are used.
crimes_schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        "DR_NO", "Date_Rptd", "DATE_OCC", "TIME_OCC",
        "AREA", "AREA_NAME", "Rpt_Dist_No", "Part_1-2",
        "Crm_Cd", "Crm_Cd_Desc", "Mocodes",
        "Vict_Age", "Vict_Sex", "Vict_Descent",
        "Premis_Cd", "Premis_Desc",
        "Weapon_Used_Cd", "Weapon_Desc",
        "Status", "Status_Desc",
        "Crm_Cd_1", "Crm_Cd_2", "Crm_Cd_3", "Crm_Cd_4",
        "LOCATION", "Cross_Street", "LAT", "LON",
    )
])

# Read crime data (two exports sharing the same schema).
# The 2010-2019 export was previously read with header='false', unlike its
# 2020+ sibling, which turns a header line into a bogus data row.
# NOTE(review): assumes both exports start with a header line -- confirm.
crimes_df1 = (
    spark.read.format('csv')
    .options(header='true')
    .schema(crimes_schema)
    .load("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv")
)

crimes_df2 = (
    spark.read.format('csv')
    .options(header='true')
    .schema(crimes_schema)
    .load("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv")
)

# Combine datasets (positional union; both frames use crimes_schema).
crimes_all = crimes_df1.union(crimes_df2)

# Filter valid coordinates and create geometry column for crimes.
# Validity is checked on the *parsed* values so that text which does not cast
# to a double is dropped as well, instead of producing a NULL point.
crime_lat = col("LAT").cast("double")
crime_lon = col("LON").cast("double")
crimes_df = (
    crimes_all
    .filter(crime_lat.isNotNull() & crime_lon.isNotNull())
    # A point at exactly (0, 0) cannot fall inside any Los Angeles polygon, so
    # removing it only shrinks the input of the spatial join.
    # NOTE(review): (0, 0) is presumably the dataset's "location missing" marker -- confirm.
    .filter(~((crime_lat == 0) & (crime_lon == 0)))
    # Sedona points are (x, y) = (longitude, latitude).
    .withColumn("geometry", ST_Point(crime_lon, crime_lat))
)

# Census-block GeoJSON: read the whole document as one multi-line record,
# then turn the `features` array into one row per feature.
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"
feature_rows = (
    sedona.read.format("geojson")
    .option("multiLine", "true")
    .load(geojson_path)
    .selectExpr("explode(features) as features")
)
blocks_df = feature_rows.select("features.*")

# Promote every field of the nested `properties` struct to a top-level column
# (keeping its name) and carry the geometry along.
property_names = blocks_df.schema["properties"].dataType.fieldNames()
property_columns = [col(f"properties.{name}").alias(name) for name in property_names]
flattened_df = (
    blocks_df
    .select(property_columns + ["geometry"])
    .drop("properties")
    .drop("type")
)

# Keep only City-of-Los-Angeles blocks that report both housing units and
# residents (non-null and strictly positive).
has_housing = col("HOUSING10").isNotNull() & (col("HOUSING10") > 0)
has_population = col("POP_2010").isNotNull() & (col("POP_2010") > 0)
la_df = (
    flattened_df
    .filter(col("CITY") == "Los Angeles")
    .filter(has_housing)
    .filter(has_population)
)

# One row per community (COMM).  Note that `sum` here is the Spark aggregate
# brought in by the wildcard import, not the Python builtin, and that the
# unioned geometry covers only the blocks that passed the filter above.
community_aggregates = [
    ST_Union_Aggr("geometry").alias("geometry"),     # merged outline of the community's blocks
    collect_set("ZCTA10").alias("ZIPCodes"),         # distinct ZIP codes seen in the community
    sum("HOUSING10").alias("TotalHousing"),          # housing units over all blocks
    sum("POP_2010").alias("TotalPopulation"),        # residents over all blocks
]
LA_areas = la_df.groupBy("COMM").agg(*community_aggregates)

# Clean and prepare the income column in income_df: strip "$" and "," from the
# text and cast the remainder to double (text that still fails to parse
# becomes NULL).
income_df_cleaned = income_df.withColumn(
    "income_cleaned",
    regexp_replace(col("income"), "[$,]", "").cast("double"),
)

# Keep the Los Angeles rows that carry a usable income figure.  Dropping NULL
# incomes here prevents communities whose income could not be parsed from
# surfacing with a NULL IncomePerCapita, which an ascending sort would rank
# first (Spark orders NULLs first when sorting ascending).
# The renames avoid a clash with the census frame's COMM column after the join.
income_df_LA = (
    income_df_cleaned
    .filter(col("Comm").contains("Los Angeles"))
    .filter(col("income_cleaned").isNotNull())
    .withColumnRenamed("Comm", "Community")
    .withColumnRenamed("ZIP", "ZIP_Income")
)

# Join the census blocks with the income rows on ZIP code, since the community
# names of the two sources don't match.
joined_df = la_df.join(
    income_df_LA,
    la_df["ZCTA10"] == income_df_LA["ZIP_Income"],
    how="inner",
)

# Per-block estimate of income per person: the ZIP-level income figure times
# the block's housing units, divided by the block's population.  (The column
# keeps its historical "_zip" suffix although the value is per census block.)
joined_df_zip = joined_df.withColumn(
    "IncomePerCapita_zip",
    col("income_cleaned") * col("HOUSING10") / col("POP_2010"),
)

# Income per capita of each community = mean of its per-block estimates,
# rounded to 3 decimals (`round` is the Spark function here, not the builtin).
# NOTE(review): this is an unweighted mean over blocks, so a tiny block counts
# as much as a large one; sum(income * housing) / sum(population) would weight
# blocks by size -- confirm which definition is intended.
aggregated_df = joined_df_zip.groupBy("COMM").agg(
    round(avg("IncomePerCapita_zip"), 3).alias("IncomePerCapita")
)

# Timestamp layout used to parse DATE_OCC (12-hour clock with AM/PM marker).
datetime_format = "MM/dd/yyyy hh:mm:ss a"

# Step 1: Convert DATE_OCC to a proper date and keep only crimes that occurred in 2015
crimes_2015 = crimes_df.withColumn(
    "Year_OCC",
    year(to_date(col("DATE_OCC"), datetime_format))
).filter(col("Year_OCC") == 2015)

# Step 2: Spatial join -- attach to each crime the community whose merged
# geometry contains the crime's point; only the two columns needed later are kept.
crimes_in_areas = crimes_2015.alias("crimes").join(
    LA_areas.alias("areas"),
    ST_Contains(col("areas.geometry"), col("crimes.geometry")),
    "inner"
).select(
    col("areas.COMM").alias("Community"),
    col("crimes.Vict_Descent")
)


def _crimes_in_ranked_communities(crimes, income_by_comm, ascending, n=3):
    """Return the rows of `crimes` that fall in the `n` richest or poorest communities.

    Parameters
    ----------
    crimes : DataFrame
        Must expose a `Community` column; all of its columns are returned unchanged.
    income_by_comm : DataFrame
        One row per community with columns `COMM` and `IncomePerCapita`.
    ascending : bool
        True selects the `n` lowest incomes, False the `n` highest.
    n : int
        Number of communities to select.

    Returns
    -------
    DataFrame
        Same columns as `crimes`, restricted to the selected communities.
    """
    income = col("IncomePerCapita")
    ranking = income.asc() if ascending else income.desc()
    selected = (
        income_by_comm
        # An ascending sort places NULLs first, so unknown incomes would
        # otherwise be reported as the "poorest" communities.
        .filter(income.isNotNull())
        # COMM is a secondary key so that ties resolve the same way on every run.
        .orderBy(ranking, col("COMM"))
        .limit(n)
        .select(col("COMM").alias("SelectedCOMM"))
    )
    # A semi join filters `crimes` without adding columns or duplicating rows.
    return crimes.join(
        selected,
        crimes["Community"] == selected["SelectedCOMM"],
        "left_semi"
    )


# Step 3: Filter crimes for top 3 and bottom 3 income areas
crimes_top3 = _crimes_in_ranked_communities(crimes_in_areas, aggregated_df, ascending=False)
crimes_bottom3 = _crimes_in_ranked_communities(crimes_in_areas, aggregated_df, ascending=True)

# Step 4: Lookup from single-letter Vict_Descent codes to readable descriptions
vict_descent_mapping = dict(
    A="Other Asian",
    B="Black",
    C="Chinese",
    D="Cambodian",
    F="Filipino",
    G="Guamanian",
    H="Hispanic/Latin/Mexican",
    I="American Indian/Alaskan Native",
    J="Japanese",
    K="Korean",
    L="Laotian",
    O="Other",
    P="Pacific Islander",
    S="Samoan",
    U="Hawaiian",
    V="Vietnamese",
    W="White",
    X="Unknown",
    Z="Asian Indian",
)

# Convert mapping dictionary to a two-column (Code, Description) DataFrame
mapping_schema = StructType([
    StructField("Code", StringType(), True),
    StructField("Description", StringType(), True)
])
mapping_df = spark.createDataFrame(
    list(vict_descent_mapping.items()),
    schema=mapping_schema
)


def _with_victim_ethnicity(crimes, descent_labels):
    """Replace the Vict_Descent code of each crime by its description.

    Parameters
    ----------
    crimes : DataFrame
        Needs the columns `Community` and `Vict_Descent`.
    descent_labels : DataFrame
        Lookup table with columns `Code` and `Description`.

    Returns
    -------
    DataFrame
        Columns `Community` and `Victim_Ethnicity`.  Crimes whose code is NULL
        or missing from the lookup are dropped: the inner join gives the same
        rows as a left join followed by a not-null filter on the description.
    """
    return crimes.join(
        descent_labels,
        crimes["Vict_Descent"] == descent_labels["Code"],
        "inner"
    ).select(
        col("Community"),
        col("Description").alias("Victim_Ethnicity")
    )


def _count_by_ethnicity(labelled_crimes):
    """Count victims per (Community, Victim_Ethnicity).

    The result is sorted by community, then by descending count; the ethnicity
    name is a final tie-break so equal counts are listed in a stable order.
    """
    return (
        labelled_crimes
        .groupBy("Community", "Victim_Ethnicity")
        .count()
        .orderBy(col("Community"), col("count").desc(), col("Victim_Ethnicity"))
    )


# Step 5: Attach descriptions to the crimes of the top 3 and bottom 3 areas
# (rows without a known description are removed by the join itself)
crimes_top3_with_description = _with_victim_ethnicity(crimes_top3, mapping_df)
crimes_bottom3_with_description = _with_victim_ethnicity(crimes_bottom3, mapping_df)

# Step 6: Count victims by ethnicity for top 3 areas
race_profile_top3 = _count_by_ethnicity(crimes_top3_with_description)

# Step 7: Count victims by ethnicity for bottom 3 areas
race_profile_bottom3 = _count_by_ethnicity(crimes_bottom3_with_description)

# Print each profile under its heading: first the top 3 areas by income, then
# the bottom 3.  Up to 100 rows are shown and cell values are not truncated.
for heading, profile in (
    ("Racial Profile of Crime Victims in Top 3 Areas by Income:", race_profile_top3),
    ("Racial Profile of Crime Victims in Bottom 3 Areas by Income:", race_profile_bottom3),
):
    print(heading)
    profile.show(n=100, truncate=False)

# Report the wall-clock time elapsed since start_time was recorded.
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
3464,application_1732639283265_3420,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Racial Profile of Crime Victims in Top 3 Areas by Income:
+-----------------+------------------------------+-----+
|Community        |Victim_Ethnicity              |count|
+-----------------+------------------------------+-----+
|Bel Air          |White                         |148  |
|Bel Air          |Other                         |56   |
|Bel Air          |Hispanic/Latin/Mexican        |15   |
|Bel Air          |Black                         |7    |
|Bel Air          |Unknown                       |6    |
|Bel Air          |Other Asian                   |5    |
|Beverly Crest    |White                         |252  |
|Beverly Crest    |Other                         |77   |
|Beverly Crest    |Hispanic/Latin/Mexican        |23   |
|Beverly Crest    |Black                         |13   |
|Beverly Crest    |Other Asian                   |12   |
|Beverly Crest    |Unknown                       |8    |
|Pacific Palisades|White                         |471  |
|Pacific Palisades|Other      