In [2]:
# Spark Session
from pyspark.sql import SparkSession

# Build (or reuse) a local session; "local[*]" runs Spark in-process using all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Understand Plans and DAG")
    .getOrCreate()
)

# Display the session summary as the cell output.
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/13 09:11:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [27]:
# Disable Adaptive Query Execution (AQE) and automatic broadcast joins so the
# physical plans below show the plain shuffle + sort-merge join.

spark.conf.set("spark.sql.adaptive.enabled", False)
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", False)
# A threshold of -1 turns off automatic broadcasting of the smaller join side.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

In [28]:
# Check default parallelism
# (the number of partitions Spark uses when no explicit count is given)

spark.sparkContext.defaultParallelism

8

In [29]:
# Build two single-column ("id") DataFrames with overlapping values
# (the end bound is exclusive):
#   df_1 -> 4, 6, 8, ..., 198   (step 2)
#   df_2 -> 2, 6, 10, ..., 198  (step 4)
df_1 = spark.range(start=4, end=200, step=2)
df_2 = spark.range(start=2, end=200, step=4)

# No partition count was requested, so this reports the default number of splits.
df_2.rdd.getNumPartitions()

8

In [30]:
# Give each side an explicit, different partition count.
# repartition(n) redistributes the rows round-robin, which adds a shuffle of its own.
df_3 = df_1.repartition(numPartitions=5)
df_4 = df_2.repartition(numPartitions=7)

# Confirm the new partition count of the second DataFrame.
df_4.rdd.getNumPartitions()

7

In [31]:
# Inner equi-join of the two repartitioned DataFrames on "id".
# Both sides are hash-shuffled on the join key into spark.sql.shuffle.partitions
# partitions (200 by default; adjustable through that config).

df_joined = df_3.join(other=df_4, on="id", how="inner")

In [32]:
# Get the sum of ids
# Running this action produces 6 stages and 229 tasks (one task per partition):
#   8 + 8 -> the two range scans (8 splits each)
#   5 + 7 -> the outputs of the two repartition() calls
#   200   -> the shuffle partitions used for the join
#   1     -> the final single-partition aggregate
# Total: 8 + 8 + 5 + 7 + 200 + 1 = 229

df_sum = df_joined.selectExpr("sum(id) as sum_id")

df_sum.show()

+------+
|sum_id|
+------+
|  4998|
+------+



                                                                                

In [33]:
# Explain plan
# Print the physical plan; every Exchange node in it is a shuffle, i.e. a stage boundary.

df_sum.explain()

== Physical Plan ==
*(6) HashAggregate(keys=[], functions=[sum(id#47L)])
+- Exchange SinglePartition, ENSURE_REQUIREMENTS, [plan_id=351]
   +- *(5) HashAggregate(keys=[], functions=[partial_sum(id#47L)])
      +- *(5) Project [id#47L]
         +- *(5) SortMergeJoin [id#47L], [id#49L], Inner
            :- *(2) Sort [id#47L ASC NULLS FIRST], false, 0
            :  +- Exchange hashpartitioning(id#47L, 200), ENSURE_REQUIREMENTS, [plan_id=335]
            :     +- Exchange RoundRobinPartitioning(5), REPARTITION_BY_NUM, [plan_id=334]
            :        +- *(1) Range (4, 200, step=2, splits=8)
            +- *(4) Sort [id#49L ASC NULLS FIRST], false, 0
               +- Exchange hashpartitioning(id#49L, 200), ENSURE_REQUIREMENTS, [plan_id=342]
                  +- Exchange RoundRobinPartitioning(7), REPARTITION_BY_NUM, [plan_id=341]
                     +- *(3) Range (2, 200, step=4, splits=8)




In [35]:
# Union the aggregated result with df_4 again to see the skipped stages.
# A primary benefit of shuffle writes: stages whose shuffle output already
# exists are skipped instead of being recomputed.
# Note: union() matches columns by position, so df_4's "id" values are shown
# under the "sum_id" column name.

df_union = df_sum.union(df_4)

df_union.show()

+------+
|sum_id|
+------+
|  4998|
|     2|
|    46|
|    58|
|    90|
|   102|
|   138|
|   158|
|   194|
|    18|
|    38|
|    74|
|   166|
|   174|
|    30|
|    66|
|    98|
|   122|
|   146|
|   162|
+------+
only showing top 20 rows



In [36]:
# Explain plan
# Look for the ReusedExchange node: it marks a shuffle that is reused rather than recomputed.
df_union.explain()

== Physical Plan ==
Union
:- *(6) HashAggregate(keys=[], functions=[sum(id#47L)])
:  +- Exchange SinglePartition, ENSURE_REQUIREMENTS, [plan_id=589]
:     +- *(5) HashAggregate(keys=[], functions=[partial_sum(id#47L)])
:        +- *(5) Project [id#47L]
:           +- *(5) SortMergeJoin [id#47L], [id#49L], Inner
:              :- *(2) Sort [id#47L ASC NULLS FIRST], false, 0
:              :  +- Exchange hashpartitioning(id#47L, 200), ENSURE_REQUIREMENTS, [plan_id=573]
:              :     +- Exchange RoundRobinPartitioning(5), REPARTITION_BY_NUM, [plan_id=572]
:              :        +- *(1) Range (4, 200, step=2, splits=8)
:              +- *(4) Sort [id#49L ASC NULLS FIRST], false, 0
:                 +- Exchange hashpartitioning(id#49L, 200), ENSURE_REQUIREMENTS, [plan_id=580]
:                    +- Exchange RoundRobinPartitioning(7), REPARTITION_BY_NUM, [plan_id=579]
:                       +- *(3) Range (2, 200, step=4, splits=8)
+- ReusedExchange [id#70L], Exchange RoundRobinPart

In [37]:
# DataFrame to RDD
# Use an RDD when you need to control the physical distribution of the data
# with user code, or when you must work extensively with the Spark Core APIs;
# otherwise it is not recommended.

df_1.rdd

MapPartitionsRDD[70] at javaToPython at NativeMethodAccessorImpl.java:0

In [3]:
# Spark pipelining: Spark packs as many transformations as possible into a
# single stage, and starts a new stage whenever it encounters a shuffle
# (an Exchange in the plan).
spark.stop()