In [1]:
import pyspark
from pyspark.sql import (
    functions as f,
    SparkSession,
    types as t
)

# Spark configuration with dynamic partition pruning switched on explicitly,
# so the setting is visible in this notebook regardless of the cluster default.
conf = pyspark.SparkConf()
conf.set("spark.sql.optimizer.dynamicPartitionPruning.enabled", "true")

# Start a session with that configuration, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("partition_pruning")
    .config(conf=conf)
    .getOrCreate()
)



# Explicit schema for the order CSV: four nullable columns, with "region" and
# "price" read as integers and the rest as strings.
table_schema = (
    t.StructType()
    .add("date", t.StringType(), True)
    .add("name", t.StringType(), True)
    .add("region", t.IntegerType(), True)
    .add("price", t.IntegerType(), True)
)

# Load the orders with the schema above. No "header" option is set, so the
# reader's default applies.
# NOTE(review): confirm ecommerce_order.csv really has no header row.
csv_file_path = "file:///home/jovyan/work/sample/ecommerce_order.csv"
df = spark.read.csv(csv_file_path, schema=table_schema)

# Persist the orders as Parquet partitioned by "region", replacing whatever is
# already stored at the output path.
(
    df.write
    .partitionBy("region")
    .mode("overwrite")
    .parquet("/home/jovyan/work/output/partition_pruning")
)

# Load the partitioned data back and total the prices for region 2 only.
# Filtering on the partition column is the case partition pruning targets;
# use the explain() call below to check the resulting plan.
read_df = spark.read.parquet("/home/jovyan/work/output/partition_pruning")
region_two_df = read_df.filter("region==2")
sales_total_df = region_two_df.agg(
    f.round(f.sum("price"), 2).alias("sales")
)

# Uncomment to inspect the plan and the aggregated result:
#sales_total_df.explain(mode="formatted")
#sales_total_df.show()

# Load the region dimension table. Column names are taken from the header row
# and column types are inferred by Spark rather than declared.
csv_file_path = "file:///home/jovyan/work/sample/ecommerce_region.csv"
region_reader = spark.read.options(header="true", inferSchema="true")
region_df = region_reader.csv(csv_file_path)

# Variant without the broadcast hint, kept so the two plans can be compared:
# joined_df = (
#     read_df.join(region_df, read_df.region == region_df.region_id, "inner")
#     .where(region_df.city == "San Francisco")
# )
# joined_df.show()

# Variant with the broadcast hint on the dimension table.
# The partitioned orders are read again so this cell does not depend on the
# DataFrame created earlier.
read_df = spark.read.parquet("/home/jovyan/work/output/partition_pruning")

# Join key: the orders' partition column against the dimension's id column.
region_matches = read_df.region == region_df.region_id
# Predicate on the dimension side only; it selects a single city.
in_san_francisco = region_df.city == "San Francisco"

joined_df = (
    read_df
    .join(f.broadcast(region_df), on=region_matches, how="inner")
    .filter(in_san_francisco)
)

# Print the joined rows first, then the formatted physical plan.
joined_df.show()
joined_df.explain(mode="formatted")

+----------+----------------+-----+------+---------+-------------+
|      date|            name|price|region|region_id|         city|
+----------+----------------+-----+------+---------+-------------+
|2022-04-03|    Tory Delgado| 2158|     1|        1|San Francisco|
|2022-05-24|   Jene Franklin| 3643|     1|        1|San Francisco|
|2022-05-28|     Kasey Wolfe| 1236|     1|        1|San Francisco|
|2022-10-07|  Walton Kennedy| 2381|     1|        1|San Francisco|
|2022-10-06|Lakiesha Jimenez| 2629|     1|        1|San Francisco|
|2022-06-13| Piedad Williams| 1670|     1|        1|San Francisco|
|2022-12-13|    Elvina Grant| 2459|     1|        1|San Francisco|
|2022-10-27|   Cristie Stone| 3325|     1|        1|San Francisco|
|2022-07-03|     Lacy Flores| 1013|     1|        1|San Francisco|
|2022-01-01|   Kathey Little| 2293|     1|        1|San Francisco|
|2022-04-13|        Fe Reyes| 2438|     1|        1|San Francisco|
|2022-06-23|   Apryl Holland| 3003|     1|        1|San Franci