In [1]:
// Build a single-column DataFrame whose column "number" holds the values 0 through 999.
val myRange = spark.range(0, 1000).toDF("number")

myRange = [number: bigint]


[number: bigint]

In [2]:
// Transformation: keep only the even numbers from myRange.
val divisBy2 = myRange.filter("number % 2 = 0")

divisBy2 = [number: bigint]


[number: bigint]

行動操作

In [3]:
// Action: count the rows of divisBy2 (the even numbers selected above).
divisBy2.count()

500

<h3>讀取資料</h3>

In [18]:
// Load the 2015 flight summary CSV into a DataFrame.
// "header" makes the first line supply the column names; "inferSchema" asks Spark
// to derive the column types from the data instead of reading everything as strings.
val flightData2015 = spark.read
  .options(Map("inferSchema" -> "true", "header" -> "true"))
  .csv("/home/jovyan/work/Spark-The-Definitive-Guide-master/data/flight-data/csv/2015-summary.csv")

flightData2015 = [DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]


[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]

In [17]:
// Bring the first three rows back to the driver as an Array of rows.
flightData2015.head(3)

Array([United States,Romania,15], [United States,Croatia,1], [United States,Ireland,344])

<b>Spark將會如何執行此操作(sort)</b>

In [14]:
// Print the physical plan Spark would use to sort the flight data by "count".
// explain() only prints the plan; it does not return the sorted data.
flightData2015.sort("count").explain()

== Physical Plan ==
*(2) Sort [count#42 ASC NULLS FIRST], true, 0
+- Exchange rangepartitioning(count#42 ASC NULLS FIRST, 200)
   +- *(1) FileScan csv [DEST_COUNTRY_NAME#40,ORIGIN_COUNTRY_NAME#41,count#42] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/home/jovyan/work/Spark-The-Definitive-Guide-master/data/flight-data/csv/2..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,ORIGIN_COUNTRY_NAME:string,count:int>


In [20]:
// Change the number of shuffle partitions to 5
// (the plan printed earlier shows 200 partitions in its Exchange step).
spark.conf.set("spark.sql.shuffle.partitions", "5")

In [21]:
// Print the sort plan again after changing spark.sql.shuffle.partitions,
// so the partition count in the Exchange step can be compared with the earlier plan.
flightData2015.sort("count").explain()

== Physical Plan ==
*(2) Sort [count#66 ASC NULLS FIRST], true, 0
+- Exchange rangepartitioning(count#66 ASC NULLS FIRST, 5)
   +- *(1) FileScan csv [DEST_COUNTRY_NAME#64,ORIGIN_COUNTRY_NAME#65,count#66] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/home/jovyan/work/Spark-The-Definitive-Guide-master/data/flight-data/csv/2..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,ORIGIN_COUNTRY_NAME:string,count:int>


<b>函數式程式設計概念:透過相同的輸入與操作得到相同結果</b>

In [22]:
// Order the flights by "count" (ascending) and fetch the first two rows.
flightData2015.orderBy("count").head(2)

Array([United States,Singapore,1], [Moldova,United States,1])

In [23]:
// Register the DataFrame as a temporary view named "flight_data_2015"
// so it can be queried by name from spark.sql(...).
flightData2015.createOrReplaceTempView("flight_data_2015")

In [24]:
// SQL version of the aggregation: number of rows per destination country,
// read from the "flight_data_2015" temporary view.
val sqlWay = spark.sql("""
SELECT DEST_COUNTRY_NAME, count(1)
FROM flight_data_2015
GROUP BY DEST_COUNTRY_NAME
""")

sqlWay = [DEST_COUNTRY_NAME: string, count(1): bigint]


[DEST_COUNTRY_NAME: string, count(1): bigint]

In [25]:
// DataFrame-API version of the `sqlWay` query: number of rows per destination country.
// The grouping column is named with a plain string rather than a Scala symbol literal
// ('DEST_COUNTRY_NAME): symbol literals are deprecated since Scala 2.13 and need the
// spark.implicits._ conversion, while the string form matches the other groupBy calls
// in this notebook.
val dataFrameWay = flightData2015
  .groupBy("DEST_COUNTRY_NAME")
  .count()

dataFrameWay = [DEST_COUNTRY_NAME: string, count: bigint]


[DEST_COUNTRY_NAME: string, count: bigint]

In [26]:
// Print the physical plan of the SQL query.
// explain() is side-effecting (it prints), so it is called with parentheses,
// matching the other explain() calls in this notebook.
sqlWay.explain()

== Physical Plan ==
*(2) HashAggregate(keys=[DEST_COUNTRY_NAME#64], functions=[count(1)])
+- Exchange hashpartitioning(DEST_COUNTRY_NAME#64, 5)
   +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#64], functions=[partial_count(1)])
      +- *(1) FileScan csv [DEST_COUNTRY_NAME#64] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/home/jovyan/work/Spark-The-Definitive-Guide-master/data/flight-data/csv/2..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string>


In [27]:
// Print the physical plan of the DataFrame-API query so it can be compared with the
// plan of the SQL query. explain() is side-effecting (it prints), so it is called
// with parentheses, matching the other explain() calls in this notebook.
dataFrameWay.explain()

== Physical Plan ==
*(2) HashAggregate(keys=[DEST_COUNTRY_NAME#64], functions=[count(1)])
+- Exchange hashpartitioning(DEST_COUNTRY_NAME#64, 5)
   +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#64], functions=[partial_count(1)])
      +- *(1) FileScan csv [DEST_COUNTRY_NAME#64] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/home/jovyan/work/Spark-The-Definitive-Guide-master/data/flight-data/csv/2..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string>


In [28]:
// SQL version: largest value of the "count" column in the flight_data_2015 view.
// take(1) returns the single result row wrapped in an Array.
spark.sql("SELECT max(count) from flight_data_2015").take(1)

Array([370002])

In [29]:
import org.apache.spark.sql.functions.max

// DataFrame-API version of the same query, using the imported max() function
// to find the largest value of the "count" column.
flightData2015.select(max("count")).take(1)

Array([370002])

In [30]:
// SQL version: the five destination countries with the highest summed "count",
// ordered from largest to smallest total.
val maxSql = spark.sql("""
SELECT DEST_COUNTRY_NAME, sum(count) as destination_total
FROM flight_data_2015
GROUP BY DEST_COUNTRY_NAME
ORDER BY sum(count) DESC
LIMIT 5
""")

maxSql = [DEST_COUNTRY_NAME: string, destination_total: bigint]


[DEST_COUNTRY_NAME: string, destination_total: bigint]

In [31]:
// Run the query and print the result rows as a table.
maxSql.show()

+-----------------+-----------------+
|DEST_COUNTRY_NAME|destination_total|
+-----------------+-----------------+
|    United States|           411352|
|           Canada|             8399|
|           Mexico|             7140|
|   United Kingdom|             2025|
|            Japan|             1548|
+-----------------+-----------------+



In [32]:
import org.apache.spark.sql.functions.desc

// DataFrame-API version of the top-5 destinations query.
flightData2015
  .groupBy("DEST_COUNTRY_NAME")                          // one group per destination country
  .sum("count")                                          // total per group; column is named "sum(count)"
  .withColumnRenamed("sum(count)", "destination_total")  // give the total a readable name
  .orderBy(desc("destination_total"))                    // largest totals first
  .limit(5)                                              // keep only the top five
  .show()                                                // print them as a table


+-----------------+-----------------+
|DEST_COUNTRY_NAME|destination_total|
+-----------------+-----------------+
|    United States|           411352|
|           Canada|             8399|
|           Mexico|             7140|
|   United Kingdom|             2025|
|            Japan|             1548|
+-----------------+-----------------+



In [33]:
// Same top-5 destinations pipeline, but print its physical plan instead of running it.
flightData2015
  .groupBy("DEST_COUNTRY_NAME")                          // one group per destination country
  .sum("count")                                          // total per group; column is named "sum(count)"
  .withColumnRenamed("sum(count)", "destination_total")  // give the total a readable name
  .orderBy(desc("destination_total"))                    // largest totals first
  .limit(5)                                              // keep only the top five
  .explain()                                             // show how Spark would execute the steps above

== Physical Plan ==
TakeOrderedAndProject(limit=5, orderBy=[destination_total#153L DESC NULLS LAST], output=[DEST_COUNTRY_NAME#64,destination_total#153L])
+- *(2) HashAggregate(keys=[DEST_COUNTRY_NAME#64], functions=[sum(cast(count#66 as bigint))])
   +- Exchange hashpartitioning(DEST_COUNTRY_NAME#64, 5)
      +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#64], functions=[partial_sum(cast(count#66 as bigint))])
         +- *(1) FileScan csv [DEST_COUNTRY_NAME#64,count#66] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/home/jovyan/work/Spark-The-Definitive-Guide-master/data/flight-data/csv/2..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,count:int>
