In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = (SparkSession.builder
        .appName("Analyzing Fire Calls for Service data")
        .getOrCreate())

23/05/10 00:10:32 WARN Utils: Your hostname, wedivv-H110M-S2V resolves to a loopback address: 127.0.1.1; using 192.168.1.44 instead (on interface wlp5s0)
23/05/10 00:10:32 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/10 00:10:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
fire_df = spark.read.parquet("./data/5-Fire_Calls/*")

                                                                                

In [4]:
fire_df.columns

['call_number',
 'unit_id',
 'incident_number',
 'call_type',
 'call_date',
 'watch_date',
 'call_final_disposition',
 'available_dttm',
 'address',
 'city',
 'zipcode_of_incident',
 'battalion',
 'station_area',
 'box',
 'original_priority',
 'priority',
 'final_priority',
 'als_unit',
 'call_type_group',
 'number_of_alarms',
 'unit_type',
 'unit_sequence_in_call_dispatch',
 'fire_prevention_district',
 'supervisor_district',
 'neighborhoods_analysis_boundaries',
 'case_location',
 'rowid']

In [5]:
from pyspark.sql.functions import col

fewFireDF = (fire_df
    .select("incident_number", "available_dttm", "call_type")
    .where(col("call_type") != "Medical Incident"))

fewFireDF.show(5, False)


+---------------+-----------------------+------------+
|incident_number|available_dttm         |call_type   |
+---------------+-----------------------+------------+
|20136635       |2020-11-30T11:01:50.000|Alarms      |
|21049045       |2021-04-24T08:20:17.000|Water Rescue|
|21049045       |2021-04-24T08:20:15.000|Water Rescue|
|21049045       |2021-04-24T08:19:54.000|Water Rescue|
|21049045       |2021-04-24T08:19:33.000|Water Rescue|
+---------------+-----------------------+------------+
only showing top 5 rows



In [6]:
from pyspark.sql.functions import *
(
    fewFireDF
    .select("call_type")
    .where(col("call_type").isNotNull())
    .distinct()
).show(truncate=False)

+----------------------------------+
|call_type                         |
+----------------------------------+
|Elevator / Escalator Rescue       |
|Alarms                            |
|Odor (Strange / Unknown)          |
|Citizen Assist / Service Call     |
|HazMat                            |
|Explosion                         |
|Vehicle Fire                      |
|Suspicious Package                |
|Other                             |
|Outside Fire                      |
|Traffic Collision                 |
|Assist Police                     |
|Gas Leak (Natural and LP Gases)   |
|Water Rescue                      |
|Electrical Hazard                 |
|Structure Fire                    |
|Industrial Accidents              |
|Mutual Aid / Assist Outside Agency|
|Fuel Spill                        |
|Smoke Investigation (Outside)     |
+----------------------------------+
only showing top 20 rows



In [7]:
from pyspark.sql.functions import *
(
    fewFireDF
    .select("call_type")
    .where(col("call_type").isNotNull())
    .agg(count_distinct('call_type').alias('DistinctCallTypes'))
   
).show()

+-----------------+
|DistinctCallTypes|
+-----------------+
|               24|
+-----------------+



In [8]:
(fire_df
    .select("call_date", "watch_date", "available_dttm")
).show(5, truncate=False)

+-----------------------+-----------------------+-----------------------+
|call_date              |watch_date             |available_dttm         |
+-----------------------+-----------------------+-----------------------+
|2021-04-24T00:00:00.000|2021-04-24T00:00:00.000|2021-04-24T10:26:57.000|
|2021-04-24T00:00:00.000|2021-04-24T00:00:00.000|2021-04-24T10:46:41.000|
|2020-11-30T00:00:00.000|2020-11-30T00:00:00.000|2020-11-30T11:01:50.000|
|2021-04-24T00:00:00.000|2021-04-24T00:00:00.000|2021-04-24T10:32:33.000|
|2021-04-24T00:00:00.000|2021-04-24T00:00:00.000|2021-04-24T09:50:34.000|
+-----------------------+-----------------------+-----------------------+
only showing top 5 rows



In [9]:
fire_ts_df = (
    fire_df
    .withColumn("IncidentDate", to_timestamp(col("call_date"), "yyyy-MM-dd'T'HH:mm:ss.SSS"))
    .drop("call_date")
    .withColumn("OnWatchDate", to_timestamp(col("watch_date"), "yyyy-MM-dd'T'HH:mm:ss.SSS"))
    .drop("watch_date")
    .withColumn("AvailableDtTS", to_timestamp(col("available_dttm"), "yyyy-MM-dd'T'HH:mm:ss.SSS"))   
    .drop("available_dttm")
)


In [10]:
(fire_ts_df
 .select("IncidentDate", "OnWatchDate", "AvailableDtTS")
 ).show(3)

+-------------------+-------------------+-------------------+
|       IncidentDate|        OnWatchDate|      AvailableDtTS|
+-------------------+-------------------+-------------------+
|2021-04-24 00:00:00|2021-04-24 00:00:00|2021-04-24 10:26:57|
|2021-04-24 00:00:00|2021-04-24 00:00:00|2021-04-24 10:46:41|
|2020-11-30 00:00:00|2020-11-30 00:00:00|2020-11-30 11:01:50|
+-------------------+-------------------+-------------------+
only showing top 3 rows



In [11]:
(fire_ts_df
    .select(year('IncidentDate'))
    .distinct()
    .orderBy(year('IncidentDate'))
    ).show()

+------------------+
|year(IncidentDate)|
+------------------+
|              2020|
|              2021|
|              2022|
+------------------+



In [12]:
from pyspark.sql.functions import year

(fire_ts_df
    .select(year('IncidentDate').alias('Year'), 'call_number')
    .groupBy('Year')
    .count()
    .orderBy('count', ascending=True)
).show()


+----+-----+
|Year|count|
+----+-----+
|2020|  236|
|2022|10564|
|2021|14200|
+----+-----+



In [13]:
(fire_ts_df
    .select("call_type")
    .where(col("call_type").isNotNull())
    .groupBy("call_type")
    .count()
    .orderBy("count", ascending=False)
 
 ).show(10, False)

+-------------------------------+-----+
|call_type                      |count|
+-------------------------------+-----+
|Medical Incident               |16865|
|Alarms                         |3355 |
|Structure Fire                 |1601 |
|Traffic Collision              |931  |
|Other                          |539  |
|Outside Fire                   |487  |
|Citizen Assist / Service Call  |367  |
|Gas Leak (Natural and LP Gases)|179  |
|Water Rescue                   |163  |
|Electrical Hazard              |162  |
+-------------------------------+-----+
only showing top 10 rows



In [14]:
import pyspark.sql.functions as F

(fire_ts_df
    .select(F.sum("number_of_alarms"),
            F.max("number_of_alarms"),
            F.min("number_of_alarms")
            )
 ).show()

+---------------------+---------------------+---------------------+
|sum(number_of_alarms)|max(number_of_alarms)|min(number_of_alarms)|
+---------------------+---------------------+---------------------+
|              25109.0|                    3|                    1|
+---------------------+---------------------+---------------------+



In [15]:
(fire_ts_df
    .select(year('IncidentDate').alias('Year'), 'call_number')
    .groupBy('Year')
    .count()
    .orderBy('count', ascending=True)
).show()


+----+-----+
|Year|count|
+----+-----+
|2020|  236|
|2022|10564|
|2021|14200|
+----+-----+



# exploratory data analysis

What were all the different types of fire calls in 2021?

In [16]:
(fire_ts_df
    .select("call_type")
    .distinct()
    .where(year("IncidentDate") == 2021)
).show(truncate=False)

+--------------------------------------------+
|call_type                                   |
+--------------------------------------------+
|Odor (Strange / Unknown)                    |
|Citizen Assist / Service Call               |
|Suspicious Package                          |
|Other                                       |
|Outside Fire                                |
|Traffic Collision                           |
|Assist Police                               |
|Gas Leak (Natural and LP Gases)             |
|Water Rescue                                |
|Electrical Hazard                           |
|Structure Fire                              |
|Medical Incident                            |
|Mutual Aid / Assist Outside Agency          |
|Smoke Investigation (Outside)               |
|Confined Space / Structure Collapse         |
|Watercraft in Distress                      |
|Extrication / Entrapped (Machinery, Vehicle)|
+--------------------------------------------+



What months within the year 2021 saw the highest number of fire calls?

In [17]:
# type of fire calls

(fire_ts_df
    .select("call_type")
    .distinct()
    .filter(col("call_type").like("%Fire%"))
    .where(year("IncidentDate") == 2021)

).show()

+--------------+
|     call_type|
+--------------+
|  Outside Fire|
|Structure Fire|
+--------------+



In [18]:
(fire_ts_df
    .select(month("IncidentDate").alias('month'), date_format("IncidentDate", "MMMM").alias("month_name"), 'call_type')
    .where(year("IncidentDate") == 2021)
    .filter(col("call_type").like("%Fire%"))
    .groupBy("month", "month_name")
    .count()
    .orderBy(desc("count"))

).show()

+-----+----------+-----+
|month|month_name|count|
+-----+----------+-----+
|    7|      July|  575|
|    4|     April|  367|
|   10|   October|  197|
|    5|       May|   34|
|    1|   January|   21|
|    6|      June|    7|
|    2|  February|    2|
+-----+----------+-----+



Which neighborhood in San Francisco generated the most fire calls in 2021?

In [19]:
(fire_ts_df
    .select(col('neighborhoods_analysis_boundaries').alias('neighborhood'))
    .where((col('city') == 'San Francisco') & (year("IncidentDate") == 2021))
    .filter(col("call_type").like("%Fire%"))
    .groupBy('neighborhood')
    .count()
    .orderBy(desc("count"))

).show(5, truncate=False)


+---------------------+-----+
|neighborhood         |count|
+---------------------+-----+
|Bayview Hunters Point|161  |
|Tenderloin           |109  |
|Mission              |98   |
|Bernal Heights       |70   |
|Nob Hill             |54   |
+---------------------+-----+
only showing top 5 rows



Which week in the year 2021 had the most fire calls?

In [20]:
(fire_ts_df
    .select(weekofyear(col("IncidentDate")).alias("week"))
    .where(year("IncidentDate") == 2021)
    .filter(col("call_type").like("%Fire%"))
    .groupBy("week")
    .count()
    .orderBy(desc('count'))

).show(5)

+----+-----+
|week|count|
+----+-----+
|  16|  352|
|  26|  313|
|  27|  228|
|  40|  135|
|  41|   43|
+----+-----+
only showing top 5 rows



Is there a correlation between neighborhood, zip code, and number of fire calls?

In [21]:
neighbor_calls = (fire_ts_df
    .select(col("neighborhoods_analysis_boundaries").alias("neighborhood"))
    .filter(col("call_type").like("%Fire%"))
    .groupBy("neighborhood")
    .count()
    .orderBy(desc("count"))
)

In [22]:
zipcode_calls = (fire_ts_df
    .select(col("zipcode_of_incident").alias("zipcode"))
    .filter(col("call_type").like("%Fire%"))
    .groupBy("zipcode")
    .count()
    .orderBy(desc("count"))
)

#todo

In [None]:
# EXAMPLE OF CORRELATION, gepito 

from pyspark.sql.functions import col, corr

# create a DataFrame from your data
data = [
    ('Excelsior', '94112', 10),
    ('Sunset', '94122', 5),
    ('Richmond', '94118', 3),
    ('Marina', '94123', 7),
    ('Mission', '94110', 15),
    ('Bernal Heights', '94110', 8),
    ('Castro/Upper Market', '94114', 6),
    ('Outer Mission', '94112', 12),
    ('Twin Peaks', '94131', 4),
    ('Pacific Heights', '94115', 2)
]

df = spark.createDataFrame(data, ['neighborhood', 'zipcode', 'num_calls'])

# create a vector column for each categorical variable using StringIndexer
from pyspark.ml.feature import StringIndexer, VectorAssembler

indexer = StringIndexer(inputCol='neighborhood', outputCol='neighborhood_index')
df = indexer.fit(df).transform(df)

indexer = StringIndexer(inputCol='zipcode', outputCol='zipcode_index')
df = indexer.fit(df).transform(df)

# create a vector column for the features
assembler = VectorAssembler(inputCols=['neighborhood_index', 'zipcode_index'], outputCol='features')
df = assembler.transform(df)

# compute the correlations
neighborhood_vec = col('neighborhood_index')
zipcode_vec = col('zipcode_index')
num_calls = col('num_calls')

correlation = df.select(
    corr(neighborhood_vec, num_calls), 
    corr(zipcode_vec, num_calls)
).first()

print(f"Correlation between neighborhood and number of fire calls: {correlation[0]}")
print(f"Correlation between zip code and number of fire calls: {correlation[1]}")


                                                                                

Correlation between neighborhood and number of fire calls: -0.4264014327112209
Correlation between zip code and number of fire calls: -0.6610536903660496
