In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
conf = SparkConf().setAppName('DataSet').setMaster('local[2]')
sc = SparkContext(conf=conf)
spark = SparkSession(sc)

# DataFrame
데이타프레임은 관계형 데이터베이스의 테이블에서 컬럼으로 구서된 병경 불가능한 분산 데이터 컬렉션이다.
RDD의 약점 : 스파크 api를 파이썬에서 실행하는 것은 자바 jvm과 PY4J사이의 커뮤니케이션 오버헤드 발생으로 성능 저하가 생김

In [12]:
stringJSONRDD = sc.parallelize((
    """
    {'id': '123',
    'name': 'Katie',
    'age': 19,
    'eyeColor': 'brown'}
    """,
    """
    {'id': '234',
    'name': 'Michael',
    'age': 22,
    'eyeColor': 'green'}
    """,
    """
    {'id': '345',
    'name': 'Simone',
    'age': 23,
    'eyeColor': 'blue'}
    """))

In [13]:
swimmersJSON = spark.read.json(stringJSONRDD)

In [14]:
type(stringJSONRDD)

pyspark.rdd.RDD

In [15]:
type(swimmersJSON)

pyspark.sql.dataframe.DataFrame

In [None]:
dir(swimmersJSON)

In [16]:
swimmersJSON.createOrReplaceTempView('swimmersJSON')

In [17]:
swimmersJSON.show()

+---+--------+---+-------+
|age|eyeColor| id|   name|
+---+--------+---+-------+
| 19|   brown|123|  Katie|
| 22|   green|234|Michael|
| 23|    blue|345| Simone|
+---+--------+---+-------+



In [18]:
spark.sql('select * from swimmersJSON').collect()

[Row(age=19, eyeColor='brown', id='123', name='Katie'),
 Row(age=22, eyeColor='green', id='234', name='Michael'),
 Row(age=23, eyeColor='blue', id='345', name='Simone')]

In [19]:
spark.sql('select * from swimmersJSON where id="123"').collect()

[Row(age=19, eyeColor='brown', id='123', name='Katie')]

In [20]:
swimmersJSON.printSchema()

root
 |-- age: long (nullable = true)
 |-- eyeColor: string (nullable = true)
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)



In [21]:
from pyspark.sql.types import *

In [22]:
stringCSVRDD = sc.parallelize([
    (123, 'Katie', 19, 'brown'),
    (234, 'Michael', 22, 'green'),
    (345, 'Simone', 23, 'blue')
])

In [23]:
schema = StructType([
    StructField('id', LongType(), True),
    StructField('name', StringType(), True),
    StructField('age', LongType(), True),
    StructField('eyeColor', StringType(), True)
])

In [24]:
swimmers = spark.createDataFrame(stringCSVRDD, schema)

In [25]:
swimmers.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- eyeColor: string (nullable = true)



Querying with SQL

In [None]:
# DataBricks notebook에서 지원하는 매직 키워드
%sql
select * from swimmers

In [28]:
swimmers.createOrReplaceTempView('swimmers')

In [29]:
spark.sql('select * from swimmers').show()

+---+-------+---+--------+
| id|   name|age|eyeColor|
+---+-------+---+--------+
|123|  Katie| 19|   brown|
|234|Michael| 22|   green|
|345| Simone| 23|    blue|
+---+-------+---+--------+



In [30]:
spark.sql('select count(*) from swimmers').show()

+--------+
|count(1)|
+--------+
|       3|
+--------+



In [31]:
spark.sql('select id, age from swimmers').show()

+---+---+
| id|age|
+---+---+
|123| 19|
|234| 22|
|345| 23|
+---+---+



In [32]:
spark.sql('select id, age from swimmers where age = 22').show()

+---+---+
| id|age|
+---+---+
|234| 22|
+---+---+



In [33]:
spark.sql('select name, eyeColor from swimmers where eyeColor like "b%"').show()

+------+--------+
|  name|eyeColor|
+------+--------+
| Katie|   brown|
|Simone|    blue|
+------+--------+



Querying with the DataFram API

In [34]:
display(swimmers)

DataFrame[id: bigint, name: string, age: bigint, eyeColor: string]

In [35]:
swimmers.count()

3

In [36]:
swimmers.select('id', 'age').show()

+---+---+
| id|age|
+---+---+
|123| 19|
|234| 22|
|345| 23|
+---+---+



In [37]:
swimmers.select('id', 'age').filter('age = 22').show()

+---+---+
| id|age|
+---+---+
|234| 22|
+---+---+



In [38]:
filightPerfFilePath = './departuredelays.csv'
aiportsFilePath = './airport-codes-na.txt'

spark.read.csv()
- 첫번째 : 데이터 파일 경로
- 두번째 : 첫번째 줄이 헤더인지 여부
- 세번째 : 스파크가리풀렉션으로 데이터 타입 추측 여부
- 네번째 : 구분자 지정

In [39]:
airports = spark.read.csv(aiportsFilePath, header='true', inferSchema='true', sep='\t')

In [40]:
airports.createOrReplaceTempView('airports')

In [41]:
flightPerf = spark.read.csv(filightPerfFilePath, header='true')

In [42]:
flightPerf.createOrReplaceTempView('FlightPerformance')

In [43]:
flightPerf.cache()

DataFrame[date: string, delay: string, distance: string, origin: string, destination: string]

In [44]:
spark.read.format('com.databricks.spark.csv').option('header', 'true').load(aiportsFilePath)

DataFrame[City	State	Country	IATA: string]

In [45]:
spark.sql('select * from airports where State = "WA"').show()

+-----------+-----+-------+----+
|       City|State|Country|IATA|
+-----------+-----+-------+----+
| Bellingham|   WA|    USA| BLI|
| Moses Lake|   WA|    USA| MWH|
|      Pasco|   WA|    USA| PSC|
|    Pullman|   WA|    USA| PUW|
|    Seattle|   WA|    USA| SEA|
|    Spokane|   WA|    USA| GEG|
|Walla Walla|   WA|    USA| ALW|
|  Wenatchee|   WA|    USA| EAT|
|     Yakima|   WA|    USA| YKM|
+-----------+-----+-------+----+



In [46]:
airports.select('*').filter('State = "WA"').show()

+-----------+-----+-------+----+
|       City|State|Country|IATA|
+-----------+-----+-------+----+
| Bellingham|   WA|    USA| BLI|
| Moses Lake|   WA|    USA| MWH|
|      Pasco|   WA|    USA| PSC|
|    Pullman|   WA|    USA| PUW|
|    Seattle|   WA|    USA| SEA|
|    Spokane|   WA|    USA| GEG|
|Walla Walla|   WA|    USA| ALW|
|  Wenatchee|   WA|    USA| EAT|
|     Yakima|   WA|    USA| YKM|
+-----------+-----+-------+----+



In [48]:
# 워싱턴에서 출발하는 비행기 중에서 지연이 됀 경로 검색
spark.sql('select a.City, f.origin, sum(f.delay) as Delays \
            from FlightPerformance f \
            join airports a on a.IATA = f.origin \
            where a.State = "WA" \
            group by a.City, f.origin \
            order by sum(f.delay) desc').show()

+-------+------+--------+
|   City|origin|  Delays|
+-------+------+--------+
|Seattle|   SEA|159086.0|
|Spokane|   GEG| 12404.0|
|  Pasco|   PSC|   949.0|
+-------+------+--------+



In [49]:
# 미국의 주별 지연 정보 검색
spark.sql('select a.City, sum(f.delay) as Delays \
            from FlightPerformance f \
            join airports a on a.IATA = f.origin \
            where a.Country = "USA" \
            group by a.City, f.origin \
            order by sum(f.delay) desc').show()

+---------------+---------+
|           City|   Delays|
+---------------+---------+
|        Chicago|1193929.0|
|        Atlanta|1151087.0|
|         Denver| 899406.0|
|         Dallas| 679249.0|
|        Houston| 587978.0|
|    Los Angeles| 565490.0|
|  San Francisco| 501670.0|
|         Newark| 452791.0|
|        Orlando| 445070.0|
|      Las Vegas| 441797.0|
|        Chicago| 394254.0|
|       New York| 387929.0|
|        Phoenix| 364123.0|
|      Baltimore| 362845.0|
|        Detroit| 305426.0|
|       New York| 303888.0|
|Fort Lauderdale| 297358.0|
|  Washington DC| 260151.0|
|    Minneapolis| 250779.0|
|      Charlotte| 246558.0|
+---------------+---------+
only showing top 20 rows

