In [0]:
# Spark Session
from pyspark.sql import SparkSession
spark = (
        SparkSession
        .builder
        .appName("DAG plan understanding")
        .getOrCreate()
)
spark


In [0]:
# Disable Adaptive Query Engine(AQE) and Broadcast Join
spark.conf.set("spark.sql.adaptive.enabled", False)
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", False)
spark.conf.set("spark.sql.adaptive.autoBroadcastJoinThreshold", -1)

In [0]:
# Check the default Parallelism
spark.sparkContext.defaultParallelism

Out[4]: 8

In [0]:
# Create dataframes
df1 = spark.range(4, 200, 2)
df1.rdd.getNumPartitions() 
df2 = spark.range(2, 200, 4)
df2.rdd.getNumPartitions()

# Default partition is '8'

Out[22]: 8

In [0]:
# Re-partition data
df3 = df1.repartition(5)
df4 = df2.repartition(7)

In [0]:
df3.rdd.getNumPartitions()

Out[18]: 5

In [0]:
df4.rdd.getNumPartitions()

Out[19]: 7

In [0]:
# Join the dataframes
df_joined = df3.join(df4, on='id')

# Here by default 200 shuffle partitions are created

In [0]:
# Get the sum of id's
from pyspark.sql.functions import expr
df_sum = df_joined.selectExpr("sum(id) AS total_sum")
df_sum.show()

# Here those 200 partitions are being pushed into 1 partition by default in order to do the SUM (last stage)

+---------+
|total_sum|
+---------+
|     4998|
+---------+



In [0]:
# Explain plan
df_sum.explain()

== Physical Plan ==
*(4) HashAggregate(keys=[], functions=[finalmerge_sum(merge sum#424L) AS sum(id#408L)#418L])
+- Exchange SinglePartition, ENSURE_REQUIREMENTS, [plan_id=586]
   +- *(3) HashAggregate(keys=[], functions=[partial_sum(id#408L) AS sum#424L])
      +- *(3) Project [id#408L]
         +- *(3) BroadcastHashJoin [id#408L], [id#410L], Inner, BuildRight, false, false
            :- Exchange RoundRobinPartitioning(5), REPARTITION_BY_NUM, [plan_id=576]
            :  +- *(1) Range (4, 200, step=2, splits=8)
            +- BroadcastExchange HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=580]
               +- Exchange RoundRobinPartitioning(7), REPARTITION_BY_NUM, [plan_id=579]
                  +- *(2) Range (2, 200, step=4, splits=8)




In [0]:
# UNION the data again to see SKipped Stages
df_union = df_sum.union(df4)
# df_union.show() # After doing show() the skipped stages will be displayed in the Spark UI


- In the above command, the Spark will skip the stages for 'df_sum' since it was already processed before.
- It will just create stages to perform a UNION here.
- Check 'Spark UI' for more info.

In [0]:
df_union.explain()

== Physical Plan ==
Union
:- *(4) HashAggregate(keys=[], functions=[finalmerge_sum(merge sum#424L) AS sum(id#408L)#418L])
:  +- Exchange SinglePartition, ENSURE_REQUIREMENTS, [plan_id=780]
:     +- *(3) HashAggregate(keys=[], functions=[partial_sum(id#408L) AS sum#424L])
:        +- *(3) Project [id#408L]
:           +- *(3) BroadcastHashJoin [id#408L], [id#410L], Inner, BuildRight, false, false
:              :- Exchange RoundRobinPartitioning(5), REPARTITION_BY_NUM, [plan_id=770]
:              :  +- *(1) Range (4, 200, step=2, splits=8)
:              +- BroadcastExchange HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=774]
:                 +- Exchange RoundRobinPartitioning(7), REPARTITION_BY_NUM, [plan_id=773]
:                    +- *(2) Range (2, 200, step=4, splits=8)
+- ReusedExchange [id#486L], Exchange RoundRobinPartitioning(7), REPARTITION_BY_NUM, [plan_id=773]




In [0]:
# Dataframe to RDD
df1.rdd

Out[27]: MapPartitionsRDD[193] at javaToPython at <unknown>:0

- > NOTE: RDD's are recommended to use ONLY when we require the data to be distributed physically with the help of the code or we have to work extensively with Spark Core API's. 
- In other cases, we ALWAYS go with Dataframes