In [1]:
%%configure -f
{
    "conf": {
        "spark.executor.instances": "1",
        "spark.executor.memory": "1g",
        "spark.executor.cores": "1",
        "spark.driver.memory": "2g"
    }
}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
3630,application_1732639283265_3586,pyspark,idle,Link,Link,,
3637,application_1732639283265_3593,pyspark,idle,Link,Link,,
3641,application_1732639283265_3597,pyspark,idle,Link,Link,,
3659,application_1732639283265_3605,pyspark,idle,Link,Link,,
3660,application_1732639283265_3606,pyspark,idle,Link,Link,,
3661,application_1732639283265_3607,pyspark,idle,Link,Link,,
3662,application_1732639283265_3608,pyspark,idle,Link,Link,,
3669,application_1732639283265_3615,pyspark,idle,Link,Link,,
3671,application_1732639283265_3617,pyspark,idle,Link,Link,,
3676,application_1732639283265_3622,pyspark,idle,Link,Link,,


In [54]:
# Spark SQL code
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, StringType
from pyspark.sql.functions import col, row_number
from pyspark.sql.window import Window
# To log our application's execution time:
import time

# Obtain the Spark session through the builder, requesting the app name "query2".
spark = (
    SparkSession.builder
    .appName("query2")
    .getOrCreate()
)

# Explicit schema for the police-stations CSV, given as (column name, Spark type)
# pairs in file order. Every column is declared nullable.
_police_columns = [
    ("X", FloatType()),
    ("Y", FloatType()),
    ("FID", IntegerType()),
    ("DIVISION", StringType()),
    ("LOCATION", StringType()),
    ("PREC", IntegerType()),
]
police_schema = StructType(
    [StructField(name, dtype, True) for name, dtype in _police_columns]
)

# Explicit schema for the crime CSVs: 28 columns, listed in file order and all
# read as nullable strings (no numeric/date parsing happens at load time).
_crime_columns = [
    "DR_NO",
    "Date_Rptd",
    "DATE_OCC",
    "TIME_OCC",
    "AREA",
    "AREA_NAME",
    "Rpt_Dist_No",
    "Part_1-2",
    "Crm_Cd",
    "Crm_Cd_Desc",
    "Mocodes",
    "Vict_Age",
    "Vict_Sex",
    "Vict_Descent",
    "Premis_Cd",
    "Premis_Desc",
    "Weapon_Used_Cd",
    "Weapon_Desc",
    "Status",
    "Status_Desc",
    "Crm_Cd_1",
    "Crm_Cd_2",
    "Crm_Cd_3",
    "Crm_Cd_4",
    "LOCATION",
    "Cross_Street",
    "LAT",
    "LON",
]
crimes_schema = StructType(
    [StructField(name, StringType(), True) for name in _crime_columns]
)

# Start the wall-clock timer; the CSV reads below are part of the measured time.
start_time = time.time()

# Police stations: CSV with a header row, parsed with the explicit schema.
police_station_df = (
    spark.read.format("csv")
    .schema(police_schema)
    .option("header", "true")
    .load("s3://initial-notebook-data-bucket-dblab-905418150721/LA_Police_Stations.csv")
)

# Crimes 2010-2019.
# NOTE(review): this file is read with header='false' while the 2020-present
# file uses header='true'. If this CSV does start with a header row, that row
# is ingested as a data record -- confirm against the actual file.
crimes_df1 = (
    spark.read.format("csv")
    .schema(crimes_schema)
    .option("header", "false")
    .load("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv")
)

# Crimes 2020-present.
crimes_df2 = (
    spark.read.format("csv")
    .schema(crimes_schema)
    .option("header", "true")
    .load("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv")
)

# Both frames share crimes_schema, so a positional union stacks them directly.
crimes_df = crimes_df1.union(crimes_df2)

# Expose both DataFrames to Spark SQL under table-like names.
police_station_df.createOrReplaceTempView("police_station")
crimes_df.createOrReplaceTempView("crimes")

# Project the three fields the analysis needs; Year is derived from the
# report date (Date_Rptd), parsed as 'MM/dd/yyyy hh:mm:ss a'.
crimes_query = (
    "SELECT YEAR(TO_TIMESTAMP(Date_Rptd, 'MM/dd/yyyy hh:mm:ss a')) AS Year, "
    "    AREA_NAME, Status_Desc "
    "    FROM crimes "
)

# Drop rows without a status description, then publish the result as the
# `crime_data` view for the follow-up query.
crime_data = spark.sql(crimes_query).filter(col("Status_Desc").isNotNull())
crime_data.createOrReplaceTempView("crime_data")


# For every (Year, AREA_NAME) group of the joined rows: the share of rows whose
# Status_Desc is one of the four listed values, as a percentage of all rows in
# that group. The inner join keeps only crimes whose upper-cased AREA_NAME
# equals a police station DIVISION.
percentage_query = """
    SELECT 
        c.Year, 
        c.AREA_NAME AS Area,
        COUNT(CASE 
            WHEN c.Status_Desc IN ('Adult Arrest', 'Adult Other', 'Juv Arrest', 'Juv Other') 
            THEN 1 END) * 100.0 / COUNT(*) AS Percentage
    FROM 
        police_station p
    INNER JOIN 
        crime_data c
    ON 
        UPPER(c.AREA_NAME) = p.DIVISION
    GROUP BY 
        c.Year, c.AREA_NAME
"""

percentage_data = spark.sql(percentage_query)

# Number the areas inside each year from the highest percentage downwards.
window_spec = Window.partitionBy("Year").orderBy(col("Percentage").desc())

# Attach the row number as Rank, keep the first three areas per year and
# order the rows for display.
final_result = (
    percentage_data
    .withColumn("Rank", row_number().over(window_spec))
    .filter(col("Rank") <= 3)
    .orderBy("Year", "Rank")
)

final_result.select("Year", "Area", "Percentage", "Rank").show()

# Report the wall-clock time elapsed since start_time was taken.
elapsed_time = time.time() - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----+-----------+-----------------+----+
|Year|       Area|       Percentage|Rank|
+----+-----------+-----------------+----+
|2010|    Rampart|32.94735585531813|   1|
|2010|    Olympic|31.96270619172842|   2|
|2010|     Harbor|29.63203463203463|   3|
|2011|    Olympic|35.21216768916155|   1|
|2011|    Rampart|32.51177963030083|   2|
|2011|     Harbor|28.65220520201501|   3|
|2012|    Olympic|34.41481831052383|   1|
|2012|    Rampart|32.94641810294290|   2|
|2012|     Harbor|29.81513327601032|   3|
|2013|    Olympic|33.52812271731191|   1|
|2013|    Rampart|32.08287360549222|   2|
|2013|     Harbor|29.16422459266206|   3|
|2014|   Van Nuys|31.80567315834039|   1|
|2014|West Valley|31.31198995605775|   2|
|2014|    Mission|31.16279069767442|   3|
|2015|   Van Nuys|32.64134698172773|   1|
|2015|West Valley|30.27597402597403|   2|
|2015|    Mission|30.17946067838016|   3|
|2016|   Van Nuys|31.88075572011773|   1|
|2016|West Valley|31.54798761609907|   2|
+----+-----------+----------------

In [55]:
# Spark DataFrame code
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, StringType
from pyspark.sql.functions import col, year, to_timestamp, when, count, row_number, upper
from pyspark.sql.window import Window
import time


# Obtain the Spark session through the builder, requesting the app name
# "query2_dataframe".
spark = (
    SparkSession.builder
    .appName("query2_dataframe")
    .getOrCreate()
)

# Schema of the police-stations CSV: (column name, Spark type) pairs in file
# order, all nullable.
_police_columns = [
    ("X", FloatType()),
    ("Y", FloatType()),
    ("FID", IntegerType()),
    ("DIVISION", StringType()),
    ("LOCATION", StringType()),
    ("PREC", IntegerType()),
]
police_schema = StructType(
    [StructField(name, dtype, True) for name, dtype in _police_columns]
)

# Schema of the crime CSVs: 28 columns in file order, every one a nullable string.
_crime_columns = [
    "DR_NO",
    "Date_Rptd",
    "DATE_OCC",
    "TIME_OCC",
    "AREA",
    "AREA_NAME",
    "Rpt_Dist_No",
    "Part_1-2",
    "Crm_Cd",
    "Crm_Cd_Desc",
    "Mocodes",
    "Vict_Age",
    "Vict_Sex",
    "Vict_Descent",
    "Premis_Cd",
    "Premis_Desc",
    "Weapon_Used_Cd",
    "Weapon_Desc",
    "Status",
    "Status_Desc",
    "Crm_Cd_1",
    "Crm_Cd_2",
    "Crm_Cd_3",
    "Crm_Cd_4",
    "LOCATION",
    "Cross_Street",
    "LAT",
    "LON",
]
crimes_schema = StructType(
    [StructField(name, StringType(), True) for name in _crime_columns]
)

# Start timing; the CSV reads below are part of the measured time.
start_time = time.time()

# Police stations: CSV with a header row, parsed with the explicit schema.
police_station_df = (
    spark.read.format("csv")
    .schema(police_schema)
    .option("header", "true")
    .load("s3://initial-notebook-data-bucket-dblab-905418150721/LA_Police_Stations.csv")
)

# Crimes 2010-2019.
# NOTE(review): read with header='false' while the 2020-present file uses
# header='true'. If this CSV does start with a header row, that row is
# ingested as a data record -- confirm against the actual file.
crimes_df1 = (
    spark.read.format("csv")
    .schema(crimes_schema)
    .option("header", "false")
    .load("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv")
)

# Crimes 2020-present.
crimes_df2 = (
    spark.read.format("csv")
    .schema(crimes_schema)
    .option("header", "true")
    .load("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv")
)

# Stack the two periods and keep only rows that carry a status description.
crimes_df = crimes_df1.union(crimes_df2).filter(col("Status_Desc").isNotNull())

# Derive Year from the report date (Date_Rptd) and keep only the columns the
# rest of the pipeline reads.
crime_data_df = crimes_df.select(
    year(to_timestamp(col("Date_Rptd"), "MM/dd/yyyy hh:mm:ss a")).alias("Year"),
    "AREA_NAME",
    "Status_Desc",
)

# Inner-join crimes to police stations where the upper-cased area name equals
# the station DIVISION. Both columns are already strings per their schemas.
area_matches_division = upper(col("AREA_NAME")) == col("DIVISION")
joined_data_df = (
    crime_data_df
    .join(police_station_df, area_matches_division, "inner")
    .select("Year", "Status_Desc", col("AREA_NAME").alias("Area"), "FID")
)

# Per (Year, Area): rows whose Status_Desc is one of the four listed values,
# as a percentage of all rows in the group.
counted_statuses = ["Adult Arrest", "Adult Other", "Juv Arrest", "Juv Other"]
has_counted_status = col("Status_Desc").isin(counted_statuses)
percentage_df = joined_data_df.groupBy("Year", "Area").agg(
    (count(when(has_counted_status, 1)) * 100.0 / count("*")).alias("Percentage")
)

# Number the areas within each year by descending percentage, keep the first
# three per year and order the rows for display.
window_spec = Window.partitionBy("Year").orderBy(col("Percentage").desc())
ranked_data_df = (
    percentage_df
    .withColumn("Rank", row_number().over(window_spec))
    .filter(col("Rank") <= 3)
    .orderBy("Year", "Rank")
)

ranked_data_df.show()

# Report the wall-clock time elapsed since start_time was taken.
elapsed_time = time.time() - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----+-----------+------------------+----+
|Year|       Area|        Percentage|Rank|
+----+-----------+------------------+----+
|2010|    Rampart|32.947355855318136|   1|
|2010|    Olympic|31.962706191728426|   2|
|2010|     Harbor| 29.63203463203463|   3|
|2011|    Olympic|35.212167689161554|   1|
|2011|    Rampart|32.511779630300836|   2|
|2011|     Harbor| 28.65220520201501|   3|
|2012|    Olympic|34.414818310523835|   1|
|2012|    Rampart|  32.9464181029429|   2|
|2012|     Harbor|29.815133276010318|   3|
|2013|    Olympic| 33.52812271731191|   1|
|2013|    Rampart| 32.08287360549222|   2|
|2013|     Harbor|29.164224592662055|   3|
|2014|   Van Nuys| 31.80567315834039|   1|
|2014|West Valley|31.311989956057754|   2|
|2014|    Mission|31.162790697674417|   3|
|2015|   Van Nuys|32.641346981727736|   1|
|2015|West Valley|30.275974025974026|   2|
|2015|    Mission|30.179460678380156|   3|
|2016|   Van Nuys|31.880755720117726|   1|
|2016|West Valley| 31.54798761609907|   2|
+----+-----

In [56]:
# Write the ranked result to S3 as Parquet, then read it back and display it.
# Before running:
#    1. create the output directory in your S3 bucket
#    2. set group_number below to your own group
group_number = "13"
s3_path = f"s3://groups-bucket-dblab-905418150721/group{group_number}/query2_results/"

# mode("overwrite") replaces whatever is already stored under s3_path.
ranked_data_df.write.mode("overwrite").parquet(s3_path)

# Round trip: load what was just written and show it.
ranked_data_df_again = spark.read.parquet(s3_path)
ranked_data_df_again.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----+-----------+------------------+----+
|Year|       Area|        Percentage|Rank|
+----+-----------+------------------+----+
|2010|    Rampart|32.947355855318136|   1|
|2010|    Olympic|31.962706191728426|   2|
|2010|     Harbor| 29.63203463203463|   3|
|2011|    Olympic|35.212167689161554|   1|
|2011|    Rampart|32.511779630300836|   2|
|2011|     Harbor| 28.65220520201501|   3|
|2012|    Olympic|34.414818310523835|   1|
|2012|    Rampart|  32.9464181029429|   2|
|2012|     Harbor|29.815133276010318|   3|
|2013|    Olympic| 33.52812271731191|   1|
|2013|    Rampart| 32.08287360549222|   2|
|2013|     Harbor|29.164224592662055|   3|
|2014|   Van Nuys| 31.80567315834039|   1|
|2014|West Valley|31.311989956057754|   2|
|2014|    Mission|31.162790697674417|   3|
|2015|   Van Nuys|32.641346981727736|   1|
|2015|West Valley|30.275974025974026|   2|
|2015|    Mission|30.179460678380156|   3|
|2016|   Van Nuys|31.880755720117726|   1|
|2016|West Valley| 31.54798761609907|   2|
+----+-----

In [57]:
# Save the input DataFrames as Parquet files under the group's S3 folder.
# crimes_path and police_station_path are later used to read the data back.
#
# Each path needs a "/" between the group folder and the dataset folder, the
# same layout as the query2_results path. Without it the data is written to a
# sibling prefix such as ".../group13query2_crimes/" instead of
# ".../group13/query2_crimes/".
group_number = "13"
group_prefix = "s3://groups-bucket-dblab-905418150721/group" + group_number + "/"
crimes_path = group_prefix + "query2_crimes/"
police_station_path = group_prefix + "query2_police_station/"

# mode("overwrite") replaces any previous copy stored under the same path.
police_station_df.write.mode("overwrite").parquet(police_station_path)
crimes_df.write.mode("overwrite").parquet(crimes_path)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [51]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, StringType
from pyspark.sql.functions import col, year, to_timestamp, when, count, row_number, upper
from pyspark.sql.window import Window
import time

# Obtain the Spark session through the builder, requesting the app name
# "query2_parquet".
spark = (
    SparkSession.builder
    .appName("query2_parquet")
    .getOrCreate()
)

# Timer starts before the reads, so the measurement covers loading as well.
start_time = time.time()

# Read back the Parquet copies of both inputs.
# police_station_path and crimes_path are not assigned in this cell; they must
# already exist in the kernel, so run the cell that saves the Parquet files
# (and defines both paths) first.
police_station_df_parquet = spark.read.parquet(police_station_path)
crimes_df_parquet = spark.read.parquet(crimes_path)

# Keep rows that carry a status description, derive Year from the report date
# (Date_Rptd) and project the columns used downstream.
crime_data_df = (
    crimes_df_parquet
    .filter(col("Status_Desc").isNotNull())
    .select(
        year(to_timestamp(col("Date_Rptd"), "MM/dd/yyyy hh:mm:ss a")).alias("Year"),
        "AREA_NAME",
        "Status_Desc",
    )
)

# Inner-join crimes to police stations on the case-insensitive match of area
# name and division (both sides are upper-cased before comparing).
same_area = upper(col("AREA_NAME")) == upper(col("DIVISION"))
joined_data_df = (
    crime_data_df
    .join(police_station_df_parquet, same_area, "inner")
    .select("Year", "Status_Desc", col("AREA_NAME").alias("Area"), "FID")
)

# Per (Year, Area): rows whose Status_Desc is one of the four listed values,
# as a percentage of all rows in the group.
counted_statuses = ["Adult Arrest", "Adult Other", "Juv Arrest", "Juv Other"]
has_counted_status = col("Status_Desc").isin(counted_statuses)
percentage_df = joined_data_df.groupBy("Year", "Area").agg(
    (count(when(has_counted_status, 1)) * 100.0 / count("*")).alias("Percentage")
)

# Number the areas within each year by descending percentage, keep the first
# three per year and order the rows for display.
window_spec = Window.partitionBy("Year").orderBy(col("Percentage").desc())
ranked_data_df = (
    percentage_df
    .withColumn("Rank", row_number().over(window_spec))
    .filter(col("Rank") <= 3)
    .orderBy("Year", "Rank")
)

ranked_data_df.show()

# Report the wall-clock time elapsed since start_time was taken.
elapsed_time = time.time() - start_time
print(f"Time taken (parquet): {elapsed_time:.2f} seconds")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----+-----------+------------------+----+
|Year|       Area|        Percentage|Rank|
+----+-----------+------------------+----+
|2010|    Rampart|32.947355855318136|   1|
|2010|    Olympic|31.962706191728426|   2|
|2010|     Harbor| 29.63203463203463|   3|
|2011|    Olympic|35.212167689161554|   1|
|2011|    Rampart|32.511779630300836|   2|
|2011|     Harbor| 28.65220520201501|   3|
|2012|    Olympic|34.414818310523835|   1|
|2012|    Rampart|  32.9464181029429|   2|
|2012|     Harbor|29.815133276010318|   3|
|2013|    Olympic| 33.52812271731191|   1|
|2013|    Rampart| 32.08287360549222|   2|
|2013|     Harbor|29.164224592662055|   3|
|2014|   Van Nuys| 31.80567315834039|   1|
|2014|West Valley|31.311989956057754|   2|
|2014|    Mission|31.162790697674417|   3|
|2015|   Van Nuys|32.641346981727736|   1|
|2015|West Valley|30.275974025974026|   2|
|2015|    Mission|30.179460678380156|   3|
|2016|   Van Nuys|31.880755720117726|   1|
|2016|West Valley| 31.54798761609907|   2|
+----+-----