In [1]:
# Show the kernel's current working directory; the relative CSV path
# ("data/sf-fire-calls.csv") used in the next cell is resolved against it.
pwd

'C:\\Users\\syed3\\Downloads\\LearningSparkV2-master\\LearningSparkV2-master\\chapter3'

In [4]:
import findspark
# findspark.init() must run before pyspark is imported below.
findspark.init()
from pyspark.sql import SparkSession
# Import only the type classes the schema needs instead of `import *`, so it is
# obvious where each name comes from and nothing unrelated enters the namespace.
from pyspark.sql.types import (BooleanType, FloatType, IntegerType,
                               StringType, StructField, StructType)

# Create (or reuse) the SparkSession used by every cell in this notebook.
spark = (SparkSession
    .builder
    .appName("schema")
    .getOrCreate())

# Programmatic way to define a schema: one StructField(name, type, nullable)
# per column. The date/time columns are declared as plain strings here.
fire_schema = StructType([
    StructField('CallNumber', IntegerType(), True),
    StructField('UnitID', StringType(), True),
    StructField('IncidentNumber', IntegerType(), True),
    StructField('CallType', StringType(), True),
    StructField('CallDate', StringType(), True),
    StructField('WatchDate', StringType(), True),
    StructField('CallFinalDisposition', StringType(), True),
    StructField('AvailableDtTm', StringType(), True),
    StructField('Address', StringType(), True),
    StructField('City', StringType(), True),
    StructField('Zipcode', IntegerType(), True),
    StructField('Battalion', StringType(), True),
    StructField('StationArea', StringType(), True),
    StructField('Box', StringType(), True),
    StructField('OriginalPriority', StringType(), True),
    StructField('Priority', StringType(), True),
    StructField('FinalPriority', IntegerType(), True),
    StructField('ALSUnit', BooleanType(), True),
    StructField('CallTypeGroup', StringType(), True),
    StructField('NumAlarms', IntegerType(), True),
    StructField('UnitType', StringType(), True),
    StructField('UnitSequenceInCallDispatch', IntegerType(), True),
    StructField('FirePreventionDistrict', StringType(), True),
    StructField('SupervisorDistrict', StringType(), True),
    StructField('Neighborhood', StringType(), True),
    StructField('Location', StringType(), True),
    StructField('RowID', StringType(), True),
    StructField('Delay', FloatType(), True),
])

# Use the DataFrameReader interface to read the CSV file with the explicit
# schema above; header=True tells the reader the first line is a header row.
sf_fire_file = "data/sf-fire-calls.csv"
fire_df = spark.read.csv(sf_fire_file, header=True, schema=fire_schema)

In [6]:
# Display the StructType attached to the loaded DataFrame; it should mirror
# the fire_schema passed to the reader.
fire_df.schema

StructType(List(StructField(CallNumber,IntegerType,true),StructField(UnitID,StringType,true),StructField(IncidentNumber,IntegerType,true),StructField(CallType,StringType,true),StructField(CallDate,StringType,true),StructField(WatchDate,StringType,true),StructField(CallFinalDisposition,StringType,true),StructField(AvailableDtTm,StringType,true),StructField(Address,StringType,true),StructField(City,StringType,true),StructField(Zipcode,IntegerType,true),StructField(Battalion,StringType,true),StructField(StationArea,StringType,true),StructField(Box,StringType,true),StructField(OriginalPriority,StringType,true),StructField(Priority,StringType,true),StructField(FinalPriority,IntegerType,true),StructField(ALSUnit,BooleanType,true),StructField(CallTypeGroup,StringType,true),StructField(NumAlarms,IntegerType,true),StructField(UnitType,StringType,true),StructField(UnitSequenceInCallDispatch,IntegerType,true),StructField(FirePreventionDistrict,StringType,true),StructField(SupervisorDistrict,Strin

In [9]:
# Projection and filter: keep three columns and exclude medical incidents.
# The star import is kept on purpose: later cells use names it brings in
# (e.g. to_timestamp, year, weekofyear) without importing them again.
from pyspark.sql.functions import *

not_medical_incident = col("CallType") != "Medical Incident"
projected_calls = fire_df.select("IncidentNumber", "AvailableDtTm", "CallType")
few_fire_df = projected_calls.where(not_medical_incident)

# Show the first five matching rows without truncating long values.
few_fire_df.show(5, truncate=False)

+--------------+----------------------+--------------+
|IncidentNumber|AvailableDtTm         |CallType      |
+--------------+----------------------+--------------+
|2003235       |01/11/2002 01:51:44 AM|Structure Fire|
|2003250       |01/11/2002 04:16:46 AM|Vehicle Fire  |
|2003259       |01/11/2002 06:01:58 AM|Alarms        |
|2003279       |01/11/2002 08:03:26 AM|Structure Fire|
|2003301       |01/11/2002 09:46:44 AM|Alarms        |
+--------------+----------------------+--------------+
only showing top 5 rows



In [16]:
# How many distinct CallTypes were recorded as the causes of the fire calls?
# Null call types are excluded first, then countDistinct() does the counting.
from pyspark.sql.functions import *

non_null_call_types = fire_df.select("CallType").where(col("CallType").isNotNull())
distinct_call_type_count = non_null_call_types.agg(
    countDistinct("CallType").alias("DistinctCallTypes"))
distinct_call_type_count.show()

+-----------------+
|DistinctCallTypes|
+-----------------+
|               30|
+-----------------+



In [17]:
# List (up to ten of) the distinct, non-null call types themselves.
has_call_type = col("CallType").isNotNull()
distinct_call_types = fire_df.select("CallType").where(has_call_type).distinct()
distinct_call_types.show(10, False)

+-----------------------------+
|CallType                     |
+-----------------------------+
|Elevator / Escalator Rescue  |
|Marine Fire                  |
|Aircraft Emergency           |
|Administrative               |
|Alarms                       |
|Odor (Strange / Unknown)     |
|Citizen Assist / Service Call|
|HazMat                       |
|Watercraft in Distress       |
|Explosion                    |
+-----------------------------+
only showing top 10 rows



## Because DataFrames are immutable, when we rename a column using withColumnRenamed() we get a new DataFrame while retaining the original with the old column name

## Q-3) Find out all response or delayed times greater than 5 mins?

Rename the column Delay -> ResponseDelayedinMins

Returns a new DataFrame 

Find out all calls where the response time to the fire site was delayed for more than 5 mins

In [18]:
# Rename Delay -> ResponseDelayedinMins; fire_df itself keeps the old name.
new_fire_df = fire_df.withColumnRenamed("Delay", "ResponseDelayedinMins")

# Show five of the response delays that exceed five minutes.
over_five_minutes = col("ResponseDelayedinMins") > 5
delayed_responses = new_fire_df.select("ResponseDelayedinMins").where(over_five_minutes)
delayed_responses.show(5, False)

+---------------------+
|ResponseDelayedinMins|
+---------------------+
|5.35                 |
|6.25                 |
|5.2                  |
|5.6                  |
|7.25                 |
+---------------------+
only showing top 5 rows



In [19]:
# Print the column names and types of the renamed DataFrame.
new_fire_df.printSchema()

root
 |-- CallNumber: integer (nullable = true)
 |-- UnitID: string (nullable = true)
 |-- IncidentNumber: integer (nullable = true)
 |-- CallType: string (nullable = true)
 |-- CallDate: string (nullable = true)
 |-- WatchDate: string (nullable = true)
 |-- CallFinalDisposition: string (nullable = true)
 |-- AvailableDtTm: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zipcode: integer (nullable = true)
 |-- Battalion: string (nullable = true)
 |-- StationArea: string (nullable = true)
 |-- Box: string (nullable = true)
 |-- OriginalPriority: string (nullable = true)
 |-- Priority: string (nullable = true)
 |-- FinalPriority: integer (nullable = true)
 |-- ALSUnit: boolean (nullable = true)
 |-- CallTypeGroup: string (nullable = true)
 |-- NumAlarms: integer (nullable = true)
 |-- UnitType: string (nullable = true)
 |-- UnitSequenceInCallDispatch: integer (nullable = true)
 |-- FirePreventionDistrict: string (nullable = true)
 

## Let's do some ETL:

Transform the string dates to Spark Timestamp data type so we can make some time-based queries later <br>
Returns a transformed query<br>
Cache the new DataFrame

In [25]:
# Convert the three string date columns into timestamp columns under new
# names, dropping each string original once its replacement has been added.
fire_ts_df = (new_fire_df
 .withColumn("IncidentDate", to_timestamp(col("CallDate"), "MM/dd/yyyy"))
 .drop("CallDate")
 .withColumn("OnWatchDate", to_timestamp(col("WatchDate"), "MM/dd/yyyy"))
 .drop("WatchDate")
 .withColumn("AvailableDtTS", to_timestamp(col("AvailableDtTm"),
 "MM/dd/yyyy hh:mm:ss a"))
 .drop("AvailableDtTm"))

# Mark the transformed DataFrame for caching: the queries in the following
# cells all start from fire_ts_df.
fire_ts_df.cache()

# Check the converted columns on the first five rows.
(fire_ts_df
 .select("IncidentDate", "OnWatchDate", "AvailableDtTS")
 .show(5, False))

+-------------------+-------------------+-------------------+
|IncidentDate       |OnWatchDate        |AvailableDtTS      |
+-------------------+-------------------+-------------------+
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 01:51:44|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 03:01:18|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 02:39:50|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 04:16:46|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 06:01:58|
+-------------------+-------------------+-------------------+
only showing top 5 rows



In [22]:
# Print the schema after the ETL step: the string date columns CallDate,
# WatchDate and AvailableDtTm have been dropped from this DataFrame.
fire_ts_df.printSchema()

root
 |-- CallNumber: integer (nullable = true)
 |-- UnitID: string (nullable = true)
 |-- IncidentNumber: integer (nullable = true)
 |-- CallType: string (nullable = true)
 |-- CallFinalDisposition: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zipcode: integer (nullable = true)
 |-- Battalion: string (nullable = true)
 |-- StationArea: string (nullable = true)
 |-- Box: string (nullable = true)
 |-- OriginalPriority: string (nullable = true)
 |-- Priority: string (nullable = true)
 |-- FinalPriority: integer (nullable = true)
 |-- ALSUnit: boolean (nullable = true)
 |-- CallTypeGroup: string (nullable = true)
 |-- NumAlarms: integer (nullable = true)
 |-- UnitType: string (nullable = true)
 |-- UnitSequenceInCallDispatch: integer (nullable = true)
 |-- FirePreventionDistrict: string (nullable = true)
 |-- SupervisorDistrict: string (nullable = true)
 |-- Neighborhood: string (nullable = true)
 |-- Location: string (nullable =

In [23]:
# Aggregation with groupBy() + count():
# what were the most common types of fire calls?
non_null_call_types = fire_ts_df.select("CallType").where(col("CallType").isNotNull())
call_type_counts = non_null_call_types.groupBy("CallType").count()

# Largest groups first; show the top ten without truncating the type names.
call_type_counts.orderBy("count", ascending=False).show(n=10, truncate=False)

+-------------------------------+------+
|CallType                       |count |
+-------------------------------+------+
|Medical Incident               |113794|
|Structure Fire                 |23319 |
|Alarms                         |19406 |
|Traffic Collision              |7013  |
|Citizen Assist / Service Call  |2524  |
|Other                          |2166  |
|Outside Fire                   |2094  |
|Vehicle Fire                   |854   |
|Gas Leak (Natural and LP Gases)|764   |
|Water Rescue                   |755   |
+-------------------------------+------+
only showing top 10 rows



In [24]:
# Compute the sum of alarms plus the average, minimum and maximum response
# delay over all fire calls, using the aggregate functions via the F alias.
import pyspark.sql.functions as F

alarm_and_delay_summary = fire_ts_df.select(
    F.sum("NumAlarms"),
    F.avg("ResponseDelayedinMins"),
    F.min("ResponseDelayedinMins"),
    F.max("ResponseDelayedinMins"),
)
alarm_and_delay_summary.show()

+--------------+--------------------------+--------------------------+--------------------------+
|sum(NumAlarms)|avg(ResponseDelayedinMins)|min(ResponseDelayedinMins)|max(ResponseDelayedinMins)|
+--------------+--------------------------+--------------------------+--------------------------+
|        176170|         3.892364154521585|               0.016666668|                   1844.55|
+--------------+--------------------------+--------------------------+--------------------------+



## Q-4a) What zip codes accounted for most common calls?

Let's investigate which zip codes in San Francisco accounted for the most fire calls and what types of calls they were.<br>

Filter out by CallType<br>
Group them by CallType and Zip code <br>
Count them and display them in descending order<br>
It seems like the most common calls were all related to Medical Incident, and<br>the top two zip codes are 94102 and 94103.

In [28]:
# Count calls per (CallType, Zipcode) pair and show the ten largest groups.
# The column is referenced by its schema spelling "Zipcode" in both select()
# and groupBy(); the earlier mixed "ZipCode"/"Zipcode" spelling only worked
# while column-name resolution is case-insensitive.
(fire_ts_df
 .select("CallType", "Zipcode")
 .where(col("CallType").isNotNull())
 .groupBy("CallType", "Zipcode")
 .count()
 .orderBy("count", ascending=False)
 .show(10, truncate=False))

+----------------+-------+-----+
|CallType        |Zipcode|count|
+----------------+-------+-----+
|Medical Incident|94102  |16130|
|Medical Incident|94103  |14775|
|Medical Incident|94110  |9995 |
|Medical Incident|94109  |9479 |
|Medical Incident|94124  |5885 |
|Medical Incident|94112  |5630 |
|Medical Incident|94115  |4785 |
|Medical Incident|94122  |4323 |
|Medical Incident|94107  |4284 |
|Medical Incident|94133  |3977 |
+----------------+-------+-----+
only showing top 10 rows



## Q-4b) What San Francisco neighborhoods are in the zip codes 94102 and 94103
<br>
Let's find out the neighborhoods associated with these two zip codes. In all<br> likelihood, these are some of the contested neighborhoods with high reported crime.

In [29]:
# Distinct (Neighborhood, Zipcode) pairs for the two zip codes with the most
# calls; only the first ten pairs are displayed.
in_94102 = col("Zipcode") == 94102
in_94103 = col("Zipcode") == 94103
(fire_ts_df
 .select("Neighborhood", "Zipcode")
 .where(in_94102 | in_94103)
 .distinct()
 .show(10, truncate=False))

+------------------------------+-------+
|Neighborhood                  |Zipcode|
+------------------------------+-------+
|Potrero Hill                  |94103  |
|Western Addition              |94102  |
|Tenderloin                    |94102  |
|Nob Hill                      |94102  |
|Castro/Upper Market           |94103  |
|South of Market               |94102  |
|South of Market               |94103  |
|Hayes Valley                  |94103  |
|Financial District/South Beach|94102  |
|Mission Bay                   |94103  |
+------------------------------+-------+
only showing top 10 rows



## Q-5) What was the sum of all calls, average, min and max of the response times for calls?

Let's use the built-in Spark SQL functions to compute the sum, avg, min, and max of few columns:<br>

Number of Total Alarms<br>
What were the min and max of the delay in response time before the Fire Dept<br>
arrived at the scene of the call

In [30]:
# Total number of alarms and the average / minimum / maximum response delay.
# The aggregates are qualified through the module alias so that sum, min and
# max are unambiguously Spark column functions; the bare names only worked
# because a star import in another cell shadowed Python's builtins.
import pyspark.sql.functions as F

(fire_ts_df
 .select(F.sum("NumAlarms"), F.avg("ResponseDelayedinMins"),
         F.min("ResponseDelayedinMins"), F.max("ResponseDelayedinMins"))
 .show())

+--------------+--------------------------+--------------------------+--------------------------+
|sum(NumAlarms)|avg(ResponseDelayedinMins)|min(ResponseDelayedinMins)|max(ResponseDelayedinMins)|
+--------------+--------------------------+--------------------------+--------------------------+
|        176170|         3.892364154521585|               0.016666668|                   1844.55|
+--------------+--------------------------+--------------------------+--------------------------+



## Q-6a) How many distinct years of data are in the CSV file?<br>
We can apply the Spark SQL year() function to the timestamp column IncidentDate.<br>

In all, we have fire calls from years 2000-2018 

In [35]:
# Extract the year from IncidentDate, de-duplicate, and list the years in
# ascending order.
incident_year = year("IncidentDate")
distinct_incident_years = fire_ts_df.select(incident_year).distinct()
distinct_incident_years.orderBy(incident_year).show()

+------------------+
|year(IncidentDate)|
+------------------+
|              2000|
|              2001|
|              2002|
|              2003|
|              2004|
|              2005|
|              2006|
|              2007|
|              2008|
|              2009|
|              2010|
|              2011|
|              2012|
|              2013|
|              2014|
|              2015|
|              2016|
|              2017|
|              2018|
+------------------+



## Q-6b) What week of the year in 2018 had the most fire calls?<br>
Note: Week 1 is New Year's week and week 25 is the week of July 4th. With loads of fireworks, a higher number of calls makes sense.

In [43]:
# Keep only 2018 calls, count them per week of the year, and show the
# busiest weeks first (show() defaults to the top 20 rows).
calls_in_2018 = fire_ts_df.filter(year("IncidentDate") == 2018)
weekly_call_counts = calls_in_2018.groupBy(weekofyear("IncidentDate")).count()
weekly_call_counts.orderBy("count", ascending=False).show()

+------------------------+-----+
|weekofyear(IncidentDate)|count|
+------------------------+-----+
|                      22|  259|
|                      40|  255|
|                      43|  250|
|                      25|  249|
|                       1|  246|
|                      44|  244|
|                      13|  243|
|                      32|  243|
|                      11|  240|
|                       5|  236|
|                      18|  236|
|                      23|  235|
|                      31|  234|
|                       2|  234|
|                      42|  234|
|                      19|  233|
|                      34|  232|
|                       8|  232|
|                      10|  232|
|                      21|  231|
+------------------------+-----+
only showing top 20 rows



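## Q-7) What neighborhoods in San Francisco had the worst response time in 2018?<BR>

It appears that if you live in Presidio Heights, the Fire Dept arrived in less than 3 mins, while Mission Bay took more than 6 mins. Note that the query below lists individual 2018 calls delayed by more than 5 mins, longest delay first, rather than per-neighborhood figures.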

In [50]:
# Individual 2018 calls whose response delay exceeded 5 minutes, longest
# delay first (top ten rows).
# The year filter is applied before the projection because IncidentDate is
# not one of the selected columns; filtering after select() only worked by
# relying on Spark to pull the dropped column back in during analysis.
(fire_ts_df
 .filter(year("IncidentDate") == 2018)
 .select("Neighborhood", "ResponseDelayedinMins")
 .where(col("ResponseDelayedinMins") > 5)
 .orderBy("ResponseDelayedinMins", ascending=False)
 .show(10, False))

+------------------------------+---------------------+
|Neighborhood                  |ResponseDelayedinMins|
+------------------------------+---------------------+
|Chinatown                     |491.26666            |
|Financial District/South Beach|406.63333            |
|Tenderloin                    |340.48334            |
|Haight Ashbury                |175.86667            |
|Bayview Hunters Point         |155.8                |
|Financial District/South Beach|135.51666            |
|Pacific Heights               |129.01666            |
|Potrero Hill                  |109.8                |
|Inner Sunset                  |106.13333            |
|South of Market               |94.71667             |
+------------------------------+---------------------+
only showing top 10 rows



In [53]:
# Print the query plan Spark would use for the originally loaded DataFrame.
# NOTE(review): the saved output for this cell is a SyntaxError, which this
# syntactically valid line should not raise as written — presumably the output
# is stale; re-run the cell to refresh it.
fire_df.explain()

SyntaxError: invalid syntax (Temp/ipykernel_6296/2191993274.py, line 1)