In [17]:
from pyspark.sql.functions import to_date, to_timestamp, round

In [8]:
from databricks.connect import DatabricksSession
# Obtain a Databricks Connect session (reusing an existing one if present).
# NOTE(review): `.serverless()` presumably targets serverless compute rather
# than a named cluster — confirm against the workspace configuration.
spark = (
    DatabricksSession.builder
    .serverless()
    .getOrCreate()
)


In [16]:
# Load the SF fire-calls CSV. The "header" option takes column names from the
# first row; "inferSchema" asks Spark to derive column types from the data
# instead of reading every column as a string.
raw_fire_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/databricks-datasets/learning-spark-v2/sf-fire/sf-fire-calls.csv")
)

# Preview the first 10 rows without truncating long cell values.
raw_fire_df.show(10, truncate=False)

+-----------+-------+---------------+----------------+----------+----------+----------------------+----------------------+---------------------------+----+-------------------+---------+------------+----+------------+--------+--------------+--------+---------------+---------+------------+------------------------------+------------------------+-------------------+---------------+-------------------------------------+-------------+------------------+
|Call Number|Unit ID|Incident Number|CallType        |Call Date |Watch Date|Call Final Disposition|Available DtTm        |Address                    |City|Zipcode of Incident|Battalion|Station Area|Box |OrigPriority|Priority|Final Priority|ALS Unit|Call Type Group|NumAlarms|UnitType    |Unit sequence in call dispatch|Fire Prevention District|Supervisor District|Neighborhood   |Location                             |RowID        |Delay             |
+-----------+-------+---------------+----------------+----------+----------+--------------------

In [10]:
 # Print the column names and the types that schema inference assigned to the raw frame.
 raw_fire_df.printSchema()

root
 |-- Call Number: integer (nullable = true)
 |-- Unit ID: string (nullable = true)
 |-- Incident Number: integer (nullable = true)
 |-- CallType: string (nullable = true)
 |-- Call Date: date (nullable = true)
 |-- Watch Date: date (nullable = true)
 |-- Call Final Disposition: string (nullable = true)
 |-- Available DtTm: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zipcode of Incident: integer (nullable = true)
 |-- Battalion: string (nullable = true)
 |-- Station Area: string (nullable = true)
 |-- Box: string (nullable = true)
 |-- OrigPriority: string (nullable = true)
 |-- Priority: string (nullable = true)
 |-- Final Priority: integer (nullable = true)
 |-- ALS Unit: boolean (nullable = true)
 |-- Call Type Group: string (nullable = true)
 |-- NumAlarms: integer (nullable = true)
 |-- UnitType: string (nullable = true)
 |-- Unit sequence in call dispatch: integer (nullable = true)
 |-- Fire Prevention District: str

In [18]:
# Raw CSV header -> space-free column name. Only headers that contain spaces
# are listed; columns such as CallType, Address and City already have usable
# names and are left untouched. "Zipcode of Incident" is also shortened.
FIRE_COLUMN_RENAMES = {
    "Call Number": "CallNumber",
    "Unit ID": "UnitID",
    "Incident Number": "IncidentNumber",
    "Call Date": "CallDate",
    "Watch Date": "WatchDate",
    "Call Final Disposition": "CallFinalDisposition",
    "Available DtTm": "AvailableDtTm",
    "Zipcode of Incident": "Zipcode",
    "Station Area": "StationArea",
    "Final Priority": "FinalPriority",
    "ALS Unit": "ALSUnit",
    "Call Type Group": "CallTypeGroup",
    "Unit sequence in call dispatch": "UnitSequenceInCallDispatch",
    "Fire Prevention District": "FirePreventionDistrict",
    "Supervisor District": "SupervisorDistrict",
}

# Apply all renames in one call instead of fifteen chained withColumnRenamed
# calls; the resulting columns are the same and the mapping stays reviewable
# as data. A new frame is produced, so raw_fire_df is not modified.
# NOTE(review): DataFrame.withColumnsRenamed needs PySpark 3.4+ — confirm the
# Databricks Connect version in use provides it.
renamed_fire_df = raw_fire_df.withColumnsRenamed(FIRE_COLUMN_RENAMES)

In [20]:
# Convert the date/time columns to proper temporal types and round Delay.
# NOTE(review): the printSchema output earlier in this notebook already reports
# "Call Date" and "Watch Date" as `date`, so the two to_date calls may be
# redundant in this environment — confirm before removing them.
fire_df = (
    renamed_fire_df
    .withColumn("CallDate", to_date("CallDate", "MM/dd/yyyy"))
    .withColumn("WatchDate", to_date("WatchDate", "MM/dd/yyyy"))
    # AvailableDtTm is parsed with a 12-hour clock pattern ("hh") plus an
    # AM/PM marker ("a").
    .withColumn("AvailableDtTm", to_timestamp("AvailableDtTm", "MM/dd/yyyy hh:mm:ss a"))
    # `round` here is pyspark.sql.functions.round (imported in the first cell),
    # not the Python builtin; it rounds the column to 2 decimal places.
    .withColumn("Delay", round("Delay", 2))
)

# Preview the first 10 converted rows without truncating long cell values.
fire_df.show(10, truncate=False)

+----------+------+--------------+----------------+----------+----------+--------------------+-------------------+---------------------------+----+-------+---------+-----------+----+------------+--------+-------------+-------+-------------+---------+------------+--------------------------+----------------------+------------------+---------------+-------------------------------------+-------------+-----+
|CallNumber|UnitID|IncidentNumber|CallType        |CallDate  |WatchDate |CallFinalDisposition|AvailableDtTm      |Address                    |City|Zipcode|Battalion|StationArea|Box |OrigPriority|Priority|FinalPriority|ALSUnit|CallTypeGroup|NumAlarms|UnitType    |UnitSequenceInCallDispatch|FirePreventionDistrict|SupervisorDistrict|Neighborhood   |Location                             |RowID        |Delay|
+----------+------+--------------+----------------+----------+----------+--------------------+-------------------+---------------------------+----+-------+---------+-----------+----+----

In [14]:
# Q1: how many distinct call types are there, ignoring rows with a null CallType?
# NOTE: despite the `_df` suffix, q1_df holds the number returned by count(),
# not a DataFrame.
q1_df = (
    fire_df
    .where("CallType is not null")
    .select("CallType")
    .distinct()
    .count()
)
print(q1_df)

32


+----------+------+--------------+----------------+----------+----------+--------------------+-------------------+---------------------------+----+-------+---------+-----------+----+------------+--------+-------------+-------+-------------+---------+------------+--------------------------+----------------------+------------------+---------------+-------------------------------------+-------------+-----+
|CallNumber|UnitID|IncidentNumber|CallType        |CallDate  |WatchDate |CallFinalDisposition|AvailableDtTm      |Address                    |City|Zipcode|Battalion|StationArea|Box |OrigPriority|Priority|FinalPriority|ALSUnit|CallTypeGroup|NumAlarms|UnitType    |UnitSequenceInCallDispatch|FirePreventionDistrict|SupervisorDistrict|Neighborhood   |Location                             |RowID        |Delay|
+----------+------+--------------+----------------+----------+----------+--------------------+-------------------+---------------------------+----+-------+---------+-----------+----+----