In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
conf = SparkConf().setAppName('DataSet').setMaster('local[2]')
sc = SparkContext(conf=conf)
spark = SparkSession(sc)

# DataFrame
데이터프레임은 관계형 데이터베이스의 테이블에서 컬럼으로 구사된 변경 불가능한 분산 데이터 컬렉션이다. \
RDD의 약점: 스파크 api를 파이썬에서 실행하는 것은 자바 jvm과 py4j 사이의 커뮤니케이션 오버헤드 발생으로 성능 저하가 생김

In [3]:
stringJSONRDD = sc.parallelize((
    """
    {'id':'123',
    'name':'Katie',
    'age':19,
    'eyeColor':'brown'}
    """,
    """
    {'id':'234',
    'name':'Michael',
    'age':22,
    'eyeColor':'green'}
    """,
    """
    {'id':'1345',
    'name':'Simone',
    'age':23,
    'eyeColor':'blue'}
    """,
))

In [4]:
stringJSONRDD

ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:195

In [5]:
type(stringJSONRDD)

pyspark.rdd.RDD

In [6]:
dir(stringJSONRDD)

['__add__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getnewargs__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_computeFractionForSampleSize',
 '_defaultReducePartitions',
 '_id',
 '_is_barrier',
 '_jrdd',
 '_jrdd_deserializer',
 '_memory_limit',
 '_pickled',
 '_reserialize',
 '_to_java_object_rdd',
 'aggregate',
 'aggregateByKey',
 'barrier',
 'cache',
 'cartesian',
 'checkpoint',
 'coalesce',
 'cogroup',
 'collect',
 'collectAsMap',
 'combineByKey',
 'context',
 'count',
 'countApprox',
 'countApproxDistinct',
 'countByKey',
 'countByValue',
 'ctx',
 'distinct',
 'filter',
 'first',
 'flatMap',
 'flatMapValues',
 'fold',
 'foldByKey',
 'foreach',
 'foreachPartition',
 'fullOuterJoin',
 'getCheckpo

In [7]:
swimmersJSON = spark.read.json(stringJSONRDD)

In [8]:
swimmersJSON

DataFrame[age: bigint, eyeColor: string, id: string, name: string]

In [9]:
type(swimmersJSON)

pyspark.sql.dataframe.DataFrame

In [10]:
dir(swimmersJSON)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_collectAsArrow',
 '_jcols',
 '_jdf',
 '_jmap',
 '_jseq',
 '_lazy_rdd',
 '_repr_html_',
 '_sc',
 '_schema',
 '_sort_cols',
 '_support_repr_html',
 'agg',
 'alias',
 'approxQuantile',
 'cache',
 'checkpoint',
 'coalesce',
 'colRegex',
 'collect',
 'columns',
 'corr',
 'count',
 'cov',
 'createGlobalTempView',
 'createOrReplaceGlobalTempView',
 'createOrReplaceTempView',
 'createTempView',
 'crossJoin',
 'crosstab',
 'cube',
 'describe',
 'distinct',
 'drop',
 'dropDuplicates',
 'drop_duplicates',
 'dropna',
 'dtypes',
 'exceptAll',
 'explain',
 'fillna',
 'filter',
 'first',
 'foreach',
 'f

In [11]:
swimmersJSON.createOrReplaceTempView('swimmersJSON')

In [12]:
swimmersJSON.show()

+---+--------+----+-------+
|age|eyeColor|  id|   name|
+---+--------+----+-------+
| 19|   brown| 123|  Katie|
| 22|   green| 234|Michael|
| 23|    blue|1345| Simone|
+---+--------+----+-------+



In [13]:
spark.sql('select * from swimmersJSON').collect()

[Row(age=19, eyeColor='brown', id='123', name='Katie'),
 Row(age=22, eyeColor='green', id='234', name='Michael'),
 Row(age=23, eyeColor='blue', id='1345', name='Simone')]

In [14]:
spark.sql('select * from swimmersJSON where id="123"').collect()

[Row(age=19, eyeColor='brown', id='123', name='Katie')]

In [15]:
swimmersJSON.printSchema()

root
 |-- age: long (nullable = true)
 |-- eyeColor: string (nullable = true)
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)



In [16]:
from pyspark.sql.types import *

In [17]:
stringCSVRDD = sc.parallelize([
    (123,'Katie',19,'brown'),
    (234,'Michael',22,'green'),
    (345,'Simone',23,'blue'),
])

In [18]:
schema = StructType([
    StructField('id',LongType(),True),
    StructField('name',StringType(),True),
    StructField('age',LongType(),True),
    StructField('eyeColor',StringType(),True)
])

In [19]:
swimmers = spark.createDataFrame(stringCSVRDD, schema)

## Querying with SQL

In [20]:
swimmers.createOrReplaceTempView('swimmers')

In [21]:
# # DataBricks notebook에서 지원하는 매직 키워드
# %%sql
# select * from swimmers

In [22]:
spark.sql('select * from swimmers').show()

+---+-------+---+--------+
| id|   name|age|eyeColor|
+---+-------+---+--------+
|123|  Katie| 19|   brown|
|234|Michael| 22|   green|
|345| Simone| 23|    blue|
+---+-------+---+--------+



In [23]:
spark.sql('select count(*) from swimmers').show()

+--------+
|count(1)|
+--------+
|       3|
+--------+



In [24]:
spark.sql('select id, age from swimmers').show()

+---+---+
| id|age|
+---+---+
|123| 19|
|234| 22|
|345| 23|
+---+---+



In [25]:
spark.sql('select id, age from swimmers where age=22').show()

+---+---+
| id|age|
+---+---+
|234| 22|
+---+---+



In [26]:
spark.sql('select name,eyeColor from swimmers where eyeColor like "b%"').show()

+------+--------+
|  name|eyeColor|
+------+--------+
| Katie|   brown|
|Simone|    blue|
+------+--------+



Querying with the DataFrame API

In [27]:
display(swimmers)

DataFrame[id: bigint, name: string, age: bigint, eyeColor: string]

In [28]:
swimmers.count()

3

In [29]:
swimmers.select('id','age').show()

+---+---+
| id|age|
+---+---+
|123| 19|
|234| 22|
|345| 23|
+---+---+



In [30]:
swimmers.select('id','age').filter('age=22').show()

+---+---+
| id|age|
+---+---+
|234| 22|
+---+---+



---

In [31]:
flightPerfFilePath = './departuredelays.csv'
airportsFilePath = './airport-codes-na.txt'

spark.read.csv()
- 1) 데이터 파일 경로
- 2) 첫 번째 줄이 헤더인지 여부
- 3) 스파크가리풀렉션으로 데이터 타입 추측 여부
- 4) 구분자 지정

In [32]:
airports = spark.read.csv(airportsFilePath, header='true', inferSchema='true', sep='\t')

In [33]:
airports.createOrReplaceTempView('airports')

In [34]:
flightPerf = spark.read.csv(flightPerfFilePath, header='true')

In [35]:
flightPerf.createOrReplaceTempView('FlightPerformance')

In [36]:
flightPerf.cache()

DataFrame[date: string, delay: string, distance: string, origin: string, destination: string]

In [37]:
spark.read.format('com.databricks.spark.csv').option('header','true').load(airportsFilePath)

DataFrame[City	State	Country	IATA: string]

In [38]:
spark.sql('select * from airports where State="WA"').show()
# 또는 airports.select("*").filter('State="WA"').show()

+-----------+-----+-------+----+
|       City|State|Country|IATA|
+-----------+-----+-------+----+
| Bellingham|   WA|    USA| BLI|
| Moses Lake|   WA|    USA| MWH|
|      Pasco|   WA|    USA| PSC|
|    Pullman|   WA|    USA| PUW|
|    Seattle|   WA|    USA| SEA|
|    Spokane|   WA|    USA| GEG|
|Walla Walla|   WA|    USA| ALW|
|  Wenatchee|   WA|    USA| EAT|
|     Yakima|   WA|    USA| YKM|
+-----------+-----+-------+----+



In [39]:
# 워싱턴에서 출발하는 비행기 중에서 지연이 된 경로 검색
spark.sql('select a.City, f.origin, sum(f.delay) as Delays \
            from FlightPerformance f \
            join airports a on a.IATA = f.origin \
            where a.State = "WA" \
            group by a.City, f.origin \
            order by sum(f.delay) desc').show()

+-------+------+--------+
|   City|origin|  Delays|
+-------+------+--------+
|Seattle|   SEA|159086.0|
|Spokane|   GEG| 12404.0|
|  Pasco|   PSC|   949.0|
+-------+------+--------+



In [40]:
# 미국의 주별 지연 정보 검색
spark.sql('select a.City, sum(f.delay) as Delays \
            from FlightPerformance f \
            join airports a on a.IATA = f.origin \
            where a.Country = "USA" \
            group by a.City, f.origin \
            order by sum(f.delay) desc').show()

+---------------+---------+
|           City|   Delays|
+---------------+---------+
|        Chicago|1193929.0|
|        Atlanta|1151087.0|
|         Denver| 899406.0|
|         Dallas| 679249.0|
|        Houston| 587978.0|
|    Los Angeles| 565490.0|
|  San Francisco| 501670.0|
|         Newark| 452791.0|
|        Orlando| 445070.0|
|      Las Vegas| 441797.0|
|        Chicago| 394254.0|
|       New York| 387929.0|
|        Phoenix| 364123.0|
|      Baltimore| 362845.0|
|        Detroit| 305426.0|
|       New York| 303888.0|
|Fort Lauderdale| 297358.0|
|  Washington DC| 260151.0|
|    Minneapolis| 250779.0|
|      Charlotte| 246558.0|
+---------------+---------+
only showing top 20 rows

