In [5]:
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Get (or reuse) a Spark session running in local mode, and limit log output
# to warnings and above.
spark = (
    SparkSession.builder
    .appName("Count")
    .master("local[*]")
    .getOrCreate()
)
spark.sparkContext.setLogLevel("WARN")

# Schema definition: (column name, Spark type) pairs in declaration order.
# Every column is nullable; date/time columns are kept as plain strings.
_FIRE_COLUMNS = [
    ('CallNumber', IntegerType),
    ('UnitID', StringType),
    ('IncidentNumber', IntegerType),
    ('CallType', StringType),
    ('CallDate', StringType),
    ('WatchDate', StringType),
    ('CallFinalDisposition', StringType),
    ('AvailableDtTm', StringType),
    ('Address', StringType),
    ('City', StringType),
    ('Zipcode', IntegerType),
    ('Battalion', StringType),
    ('StationArea', StringType),
    ('Box', StringType),
    ('OriginalPriority', StringType),
    ('Priority', StringType),
    ('FinalPriority', IntegerType),
    ('ALSUnit', BooleanType),
    ('CallTypeGroup', StringType),
    ('NumAlarms', IntegerType),
    ('UnitType', StringType),
    ('UnitSequenceInCallDispatch', IntegerType),
    ('FirePreventionDistrict', StringType),
    ('SupervisorDistrict', StringType),
    ('Neighborhood', StringType),
    ('Location', StringType),
    ('RowID', StringType),
    ('Delay', FloatType),
]
fire_schema = StructType(
    [StructField(name, dtype(), True) for name, dtype in _FIRE_COLUMNS]
)

# Path to the CSV file
sf_fire_file = "sf-fire-calls.csv"

# Read the CSV file (treating its first line as a header) into a DataFrame,
# applying the explicit schema instead of inferring one.
fire_ts_df = (
    spark.read
    .schema(fire_schema)
    .option("header", True)
    .csv(sf_fire_file)
)





import pyspark.sql.functions as F
# Rename the "Delay" column to a more descriptive name; the DataFrame variable
# is rebound so later code sees the renamed column.
fire_ts_df = fire_ts_df.withColumnRenamed("Delay", "ResponseDelayedInMins")

# Whole-table summary: total number of alarms, plus the average, minimum,
# maximum and variance of the response delay. The result is printed, not stored.
delay_column = "ResponseDelayedInMins"
summary_columns = [
    F.sum("NumAlarms"),
    F.avg(delay_column),
    F.min(delay_column),
    F.max(delay_column),
    F.variance(delay_column),
]
fire_ts_df.select(*summary_columns).show()


+--------------+--------------------------+--------------------------+--------------------------+-------------------------------+
|sum(NumAlarms)|avg(ResponseDelayedInMins)|min(ResponseDelayedInMins)|max(ResponseDelayedInMins)|var_samp(ResponseDelayedInMins)|
+--------------+--------------------------+--------------------------+--------------------------+-------------------------------+
|        176170|         3.892364154521585|               0.016666668|                   1844.55|              87.95225254154934|
+--------------+--------------------------+--------------------------+--------------------------+-------------------------------+

