## 최저 온도 구하기
- 1800.csv 파일 활용
- 지역별 최저 온도 구하기
- `DataFrame API` 활용하기

In [1]:
# Initialise findspark first; pyspark itself is only imported in the next cell.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Get (or create) the SparkSession used by every cell below.
# NOTE(review): the application name is spelled "low-tempeature"; it is kept
# exactly as written because it is a runtime label.
spark = (
    SparkSession.builder
    .appName("low-tempeature")
    .getOrCreate()
)

In [3]:
pwd

'c:\\Users\\apfhd\\SparkExam'

In [4]:
# Location of the 1800 weather CSV on the local disk.
directory = 'c://Users//apfhd//SparkExam/data/1800.csv'

# First look at the raw file: no schema is supplied, so the columns come back
# with the default names (_c0, _c1, ...) shown in the output below.
sdf = spark.read.csv('file:///' + directory)
sdf.show(n=5)

+-----------+--------+----+----+----+----+---+----+
|        _c0|     _c1| _c2| _c3| _c4| _c5|_c6| _c7|
+-----------+--------+----+----+----+----+---+----+
|ITE00100554|18000101|TMAX| -75|null|null|  E|null|
|ITE00100554|18000101|TMIN|-148|null|null|  E|null|
|GM000010962|18000101|PRCP|   0|null|null|  E|null|
|EZE00100082|18000101|TMAX| -86|null|null|  E|null|
|EZE00100082|18000101|TMIN|-135|null|null|  E|null|
+-----------+--------+----+----+----+----+---+----+
only showing top 5 rows



In [5]:
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField,StringType,FloatType,IntegerType
# Explicit schema for the CSV. Only four fields are described here, while the
# raw preview above shows eight columns in the file.
# Every field is declared nullable.
temp_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ('StationID', StringType()),
        ('date', IntegerType()),
        ('measure_type', StringType()),
        ('temperature', FloatType()),
    )
])

In [6]:
# Build the DataFrame again, this time applying the explicit schema so the
# columns get proper names and types.
sdf = spark.read.csv(
    'file:///' + directory,
    schema=temp_schema,
)
sdf.show(n=5)

+-----------+--------+------------+-----------+
|  StationID|    date|measure_type|temperature|
+-----------+--------+------------+-----------+
|ITE00100554|18000101|        TMAX|      -75.0|
|ITE00100554|18000101|        TMIN|     -148.0|
|GM000010962|18000101|        PRCP|        0.0|
|EZE00100082|18000101|        TMAX|      -86.0|
|EZE00100082|18000101|        TMIN|     -135.0|
+-----------+--------+------------+-----------+
only showing top 5 rows



In [7]:
# Keep only the minimum-temperature records (measure_type == "TMIN").
is_tmin = sdf["measure_type"] == "TMIN"
min_temp = sdf.filter(is_tmin)
min_temp.show(n=5)

+-----------+--------+------------+-----------+
|  StationID|    date|measure_type|temperature|
+-----------+--------+------------+-----------+
|ITE00100554|18000101|        TMIN|     -148.0|
|EZE00100082|18000101|        TMIN|     -135.0|
|ITE00100554|18000102|        TMIN|     -125.0|
|EZE00100082|18000102|        TMIN|     -130.0|
|ITE00100554|18000103|        TMIN|      -46.0|
+-----------+--------+------------+-----------+
only showing top 5 rows



In [8]:
# Nice to have: project down to just the columns the aggregation needs.
min_temp = min_temp.select(
    'StationID',
    'temperature',
)
min_temp.show(n=5)

+-----------+-----------+
|  StationID|temperature|
+-----------+-----------+
|ITE00100554|     -148.0|
|EZE00100082|     -135.0|
|ITE00100554|     -125.0|
|EZE00100082|     -130.0|
|ITE00100554|      -46.0|
+-----------+-----------+
only showing top 5 rows



In [9]:
min_temp= min_temp.groupBy("StationID").min("temperature").alias("min_temperature")
min_temp.show()

+-----------+----------------+
|  StationID|min(temperature)|
+-----------+----------------+
|ITE00100554|          -148.0|
|EZE00100082|          -135.0|
+-----------+----------------+



In [10]:
# 컬럼 추가하기
min_temp_f = min_temp.withColumn('f_temperature',
                                func.round(func.col("min(temperature)")*0.1*(9.0/5.0)+32.0,2))
min_temp_f.show()

+-----------+----------------+-------------+
|  StationID|min(temperature)|f_temperature|
+-----------+----------------+-------------+
|ITE00100554|          -148.0|         5.36|
|EZE00100082|          -135.0|          7.7|
+-----------+----------------+-------------+



In [11]:
# Stop the Spark session created at the top of the notebook.
spark.stop()