<a href="https://colab.research.google.com/github/Silvio-0-1/Python-Training/blob/main/16-12-2025/pyspark/broadcast.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [63]:
from pyspark.sql import SparkSession

# Obtain the Spark session used by every cell below.
# NOTE(review): the app name is 'partitioning' although this notebook file is
# broadcast.ipynb — confirm the name is intentional.
spark = (
    SparkSession.builder
    .appName('partitioning')
    .getOrCreate()
)

In [64]:
from pyspark.sql.functions import col, broadcast

In [65]:
# Sample order rows as (order_id, city, amount) tuples.
orders_data = [
    ("O001", "Hyderabad", 1200),
    ("O002", "Delhi", 800),
    ("O003", "Mumbai", 1500),
    ("O004", "Bangalore", 400),
    ("O005", "Hyderabad", 300),
    ("O006", "Delhi", 2000),
    ("O007", "Mumbai", 700),
    ("O008", "Bangalore", 1800),
    ("O009", "Delhi", 350),
    ("O010", "Hyderabad", 900),
]

# Column names, in the same order as the tuple fields above.
columns = ["order_id", "city", "amount"]

In [66]:
# Build the orders DataFrame from the in-memory rows; only column names are
# supplied, so the column types are inferred from the Python values.
orders_df = spark.createDataFrame(orders_data, columns)

# Print the rows, then the resulting schema.
orders_df.show()
orders_df.printSchema()

+--------+---------+------+
|order_id|     city|amount|
+--------+---------+------+
|    O001|Hyderabad|  1200|
|    O002|    Delhi|   800|
|    O003|   Mumbai|  1500|
|    O004|Bangalore|   400|
|    O005|Hyderabad|   300|
|    O006|    Delhi|  2000|
|    O007|   Mumbai|   700|
|    O008|Bangalore|  1800|
|    O009|    Delhi|   350|
|    O010|Hyderabad|   900|
+--------+---------+------+

root
 |-- order_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- amount: long (nullable = true)



In [67]:
# Small lookup table of (city, tier) pairs, one row per city.
city_data = [
    ("Hyderabad", "Tier-1"),
    ("Delhi", "Tier-1"),
    ("Mumbai", "Tier-1"),
    ("Bangalore", "Tier-1"),
]

# Column names, in the same order as the tuple fields above.
city_columns = ["city", "tier"]

In [68]:
# Build the city lookup DataFrame from the in-memory rows; only column names
# are supplied, so the column types are inferred from the Python values.
city_df = spark.createDataFrame(city_data, city_columns)

# Print the rows, then the resulting schema.
city_df.show()
city_df.printSchema()

+---------+------+
|     city|  tier|
+---------+------+
|Hyderabad|Tier-1|
|    Delhi|Tier-1|
|   Mumbai|Tier-1|
|Bangalore|Tier-1|
+---------+------+

root
 |-- city: string (nullable = true)
 |-- tier: string (nullable = true)



In [69]:
# Keep only the orders whose amount is strictly greater than 500.
filtered_orders = orders_df.filter(col("amount") > 500)

# Inner join with the city lookup on the shared "city" column; no join hint
# is given here.
joined_df = filtered_orders.join(city_df, on="city", how="inner")

# Fix the order of the output columns.
final_df = joined_df.select("order_id", "city", "tier", "amount")

In [70]:
# Print the extended plan for the un-hinted join: the parsed, analyzed and
# optimized logical plans, followed by the physical plan.
final_df.explain(extended=True)

== Parsed Logical Plan ==
'Project ['order_id, 'city, 'tier, 'amount]
+- Project [city#194, order_id#193, amount#195L, tier#207]
   +- Join Inner, (city#194 = city#206)
      :- Filter (amount#195L > cast(500 as bigint))
      :  +- LogicalRDD [order_id#193, city#194, amount#195L], false
      +- LogicalRDD [city#206, tier#207], false

== Analyzed Logical Plan ==
order_id: string, city: string, tier: string, amount: bigint
Project [order_id#193, city#194, tier#207, amount#195L]
+- Project [city#194, order_id#193, amount#195L, tier#207]
   +- Join Inner, (city#194 = city#206)
      :- Filter (amount#195L > cast(500 as bigint))
      :  +- LogicalRDD [order_id#193, city#194, amount#195L], false
      +- LogicalRDD [city#206, tier#207], false

== Optimized Logical Plan ==
Project [order_id#193, city#194, tier#207, amount#195L]
+- Join Inner, (city#194 = city#206)
   :- Filter ((isnotnull(amount#195L) AND (amount#195L > 500)) AND isnotnull(city#194))
   :  +- LogicalRDD [order_id#193, city

In [71]:
# Inner join on "city" with the small city lookup wrapped in broadcast(),
# which attaches a broadcast hint to that side of the join.
broadcast_join_df = filtered_orders.join(broadcast(city_df), on="city", how="inner")

# Fix the order of the output columns.
final_broadcast_df = broadcast_join_df.select("order_id", "city", "tier", "amount")

In [72]:
# Print the extended plan for the hinted join; the broadcast hint appears in
# the logical plans as a ResolvedHint / rightHint with strategy=broadcast.
final_broadcast_df.explain(extended=True)

== Parsed Logical Plan ==
'Project ['order_id, 'city, 'tier, 'amount]
+- Project [city#194, order_id#193, amount#195L, tier#207]
   +- Join Inner, (city#194 = city#206)
      :- Filter (amount#195L > cast(500 as bigint))
      :  +- LogicalRDD [order_id#193, city#194, amount#195L], false
      +- ResolvedHint (strategy=broadcast)
         +- LogicalRDD [city#206, tier#207], false

== Analyzed Logical Plan ==
order_id: string, city: string, tier: string, amount: bigint
Project [order_id#193, city#194, tier#207, amount#195L]
+- Project [city#194, order_id#193, amount#195L, tier#207]
   +- Join Inner, (city#194 = city#206)
      :- Filter (amount#195L > cast(500 as bigint))
      :  +- LogicalRDD [order_id#193, city#194, amount#195L], false
      +- ResolvedHint (strategy=broadcast)
         +- LogicalRDD [city#206, tier#207], false

== Optimized Logical Plan ==
Project [order_id#193, city#194, tier#207, amount#195L]
+- Join Inner, (city#194 = city#206), rightHint=(strategy=broadcast)
   