# BROADCAST()

In [1]:
import pyspark
from pyspark.sql import SparkSession

# Obtain the session used by every cell below; getOrCreate() returns an
# already-running session when there is one instead of starting another.
spark = SparkSession.builder.appName("example-broadcast").getOrCreate()

In [2]:
# Two tiny in-memory frames to join. The left frame has keys 1, 2, 3 in A1;
# the right frame has keys 1, 2 in B1.
df1 = spark.createDataFrame(
    data=[(1, "A"), (2, "B"), (3, "C")],
    schema=["A1", "A2"],
)
df2 = spark.createDataFrame(
    data=[(1, "F"), (2, "B")],
    schema=["B1", "B2"],
)

In [15]:
from pyspark.sql.functions import broadcast

# Without broadcast: a plain right outer join on A1 == B1.
dfResult1 = df1.join(df2, on=df1["A1"] == df2["B1"], how='right_outer')
dfResult1.show()

# With broadcast: the same key condition, but df2 is wrapped in broadcast().
# No join type is passed here; the plan printed for this frame reports Inner.
dfResult2 = df1.join(broadcast(df2), on=df1["A1"] == df2["B1"])
dfResult2.show()

###### Errors
# broadcast() must wrap the DataFrame being joined. Each commented-out
# variant below fails with the message shown on the line above it.

# Unsupported join type 'broadcast'
# df1.join(df2, df1.A1 == df2.B1, 'broadcast').show()

# 'DataFrame' object has no attribute 'merge'
# df1.merge(df2, df1.A1 == df2.B1, 'broadcast').show()

# Column is not iterable
# df1.join(df2, broadcast(df1.A1 == df2.B1)).show()

+---+---+---+---+
| A1| A2| B1| B2|
+---+---+---+---+
|  1|  A|  1|  F|
|  2|  B|  2|  B|
+---+---+---+---+

+---+---+---+---+
| A1| A2| B1| B2|
+---+---+---+---+
|  1|  A|  1|  F|
|  2|  B|  2|  B|
+---+---+---+---+



In [18]:
# Print the physical plan of each join, non-broadcast first, so the
# SortMergeJoin and BroadcastHashJoin strategies can be compared.
for joined in (dfResult1, dfResult2):
    joined.explain()

== Physical Plan ==
SortMergeJoin [A1#0L], [B1#4L], RightOuter
:- *(2) Sort [A1#0L ASC NULLS FIRST], false, 0
:  +- Exchange hashpartitioning(A1#0L, 200), true, [id=#372]
:     +- *(1) Filter isnotnull(A1#0L)
:        +- *(1) Scan ExistingRDD[A1#0L,A2#1]
+- *(4) Sort [B1#4L ASC NULLS FIRST], false, 0
   +- Exchange hashpartitioning(B1#4L, 200), true, [id=#377]
      +- *(3) Scan ExistingRDD[B1#4L,B2#5]


== Physical Plan ==
*(2) BroadcastHashJoin [A1#0L], [B1#4L], Inner, BuildRight
:- *(2) Filter isnotnull(A1#0L)
:  +- *(2) Scan ExistingRDD[A1#0L,A2#1]
+- BroadcastExchange HashedRelationBroadcastMode(List(input[0, bigint, false])), [id=#404]
   +- *(1) Filter isnotnull(B1#4L)
      +- *(1) Scan ExistingRDD[B1#4L,B2#5]




In [20]:
# Extended output: parsed, analyzed and optimized logical plans followed by
# the physical plan of the non-broadcast join.
dfResult1.explain(extended=True)

== Parsed Logical Plan ==
Join RightOuter, (A1#0L = B1#4L)
:- LogicalRDD [A1#0L, A2#1], false
+- LogicalRDD [B1#4L, B2#5], false

== Analyzed Logical Plan ==
A1: bigint, A2: string, B1: bigint, B2: string
Join RightOuter, (A1#0L = B1#4L)
:- LogicalRDD [A1#0L, A2#1], false
+- LogicalRDD [B1#4L, B2#5], false

== Optimized Logical Plan ==
Join RightOuter, (A1#0L = B1#4L)
:- Filter isnotnull(A1#0L)
:  +- LogicalRDD [A1#0L, A2#1], false
+- LogicalRDD [B1#4L, B2#5], false

== Physical Plan ==
SortMergeJoin [A1#0L], [B1#4L], RightOuter
:- *(2) Sort [A1#0L ASC NULLS FIRST], false, 0
:  +- Exchange hashpartitioning(A1#0L, 200), true, [id=#372]
:     +- *(1) Filter isnotnull(A1#0L)
:        +- *(1) Scan ExistingRDD[A1#0L,A2#1]
+- *(4) Sort [B1#4L ASC NULLS FIRST], false, 0
   +- Exchange hashpartitioning(B1#4L, 200), true, [id=#377]
      +- *(3) Scan ExistingRDD[B1#4L,B2#5]



In [21]:
# Extended output for the broadcast join; the logical plans carry the
# broadcast hint on the right-hand side.
dfResult2.explain(extended=True)

== Parsed Logical Plan ==
Join Inner, (A1#0L = B1#4L)
:- LogicalRDD [A1#0L, A2#1], false
+- ResolvedHint (strategy=broadcast)
   +- LogicalRDD [B1#4L, B2#5], false

== Analyzed Logical Plan ==
A1: bigint, A2: string, B1: bigint, B2: string
Join Inner, (A1#0L = B1#4L)
:- LogicalRDD [A1#0L, A2#1], false
+- ResolvedHint (strategy=broadcast)
   +- LogicalRDD [B1#4L, B2#5], false

== Optimized Logical Plan ==
Join Inner, (A1#0L = B1#4L), rightHint=(strategy=broadcast)
:- Filter isnotnull(A1#0L)
:  +- LogicalRDD [A1#0L, A2#1], false
+- Filter isnotnull(B1#4L)
   +- LogicalRDD [B1#4L, B2#5], false

== Physical Plan ==
*(2) BroadcastHashJoin [A1#0L], [B1#4L], Inner, BuildRight
:- *(2) Filter isnotnull(A1#0L)
:  +- *(2) Scan ExistingRDD[A1#0L,A2#1]
+- BroadcastExchange HashedRelationBroadcastMode(List(input[0, bigint, false])), [id=#404]
   +- *(1) Filter isnotnull(B1#4L)
      +- *(1) Scan ExistingRDD[B1#4L,B2#5]



In [22]:
# "formatted" mode prints a numbered outline of the physical plan followed
# by a details section for each numbered node.
dfResult2.explain(mode="formatted")

== Physical Plan ==
* BroadcastHashJoin Inner BuildRight (6)
:- * Filter (2)
:  +- * Scan ExistingRDD (1)
+- BroadcastExchange (5)
   +- * Filter (4)
      +- * Scan ExistingRDD (3)


(1) Scan ExistingRDD [codegen id : 2]
Output [2]: [A1#0L, A2#1]
Arguments: [A1#0L, A2#1], MapPartitionsRDD[4] at applySchemaToPythonRDD at NativeMethodAccessorImpl.java:0, ExistingRDD, UnknownPartitioning(0)

(2) Filter [codegen id : 2]
Input [2]: [A1#0L, A2#1]
Condition : isnotnull(A1#0L)

(3) Scan ExistingRDD [codegen id : 1]
Output [2]: [B1#4L, B2#5]
Arguments: [B1#4L, B2#5], MapPartitionsRDD[9] at applySchemaToPythonRDD at NativeMethodAccessorImpl.java:0, ExistingRDD, UnknownPartitioning(0)

(4) Filter [codegen id : 1]
Input [2]: [B1#4L, B2#5]
Condition : isnotnull(B1#4L)

(5) BroadcastExchange
Input [2]: [B1#4L, B2#5]
Arguments: HashedRelationBroadcastMode(List(input[0, bigint, false])), [id=#404]

(6) BroadcastHashJoin [codegen id : 2]
Left keys [1]: [A1#0L]
Right keys [1]: [B1#4L]
Join condition: None

In [24]:
# "cost" mode prints the optimized logical plan annotated with size
# statistics, then the physical plan, for the non-broadcast join.
dfResult1.explain(mode="cost")

== Optimized Logical Plan ==
Join RightOuter, (A1#0L = B1#4L), Statistics(sizeInBytes=8.51E+37 B)
:- Filter isnotnull(A1#0L), Statistics(sizeInBytes=8.0 EiB)
:  +- LogicalRDD [A1#0L, A2#1], false, Statistics(sizeInBytes=8.0 EiB)
+- LogicalRDD [B1#4L, B2#5], false, Statistics(sizeInBytes=8.0 EiB)

== Physical Plan ==
SortMergeJoin [A1#0L], [B1#4L], RightOuter
:- *(2) Sort [A1#0L ASC NULLS FIRST], false, 0
:  +- Exchange hashpartitioning(A1#0L, 200), true, [id=#372]
:     +- *(1) Filter isnotnull(A1#0L)
:        +- *(1) Scan ExistingRDD[A1#0L,A2#1]
+- *(4) Sort [B1#4L ASC NULLS FIRST], false, 0
   +- Exchange hashpartitioning(B1#4L, 200), true, [id=#377]
      +- *(3) Scan ExistingRDD[B1#4L,B2#5]




In [23]:
# "cost" mode prints the optimized logical plan annotated with size
# statistics, then the physical plan, for the broadcast join.
dfResult2.explain(mode="cost")

== Optimized Logical Plan ==
Join Inner, (A1#0L = B1#4L), rightHint=(strategy=broadcast), Statistics(sizeInBytes=8.51E+37 B)
:- Filter isnotnull(A1#0L), Statistics(sizeInBytes=8.0 EiB)
:  +- LogicalRDD [A1#0L, A2#1], false, Statistics(sizeInBytes=8.0 EiB)
+- Filter isnotnull(B1#4L), Statistics(sizeInBytes=8.0 EiB)
   +- LogicalRDD [B1#4L, B2#5], false, Statistics(sizeInBytes=8.0 EiB)

== Physical Plan ==
*(2) BroadcastHashJoin [A1#0L], [B1#4L], Inner, BuildRight
:- *(2) Filter isnotnull(A1#0L)
:  +- *(2) Scan ExistingRDD[A1#0L,A2#1]
+- BroadcastExchange HashedRelationBroadcastMode(List(input[0, bigint, false])), [id=#404]
   +- *(1) Filter isnotnull(B1#4L)
      +- *(1) Scan ExistingRDD[B1#4L,B2#5]


