In [2]:
from pyspark.sql import SparkSession 
from pyspark.sql.functions import * 
from pyspark.sql.types import *

In [3]:
# Create a new SparkSession
spark = (SparkSession
         .builder
         .appName("optimize-data-shuffles")
         .master("spark://spark-master:7077")
         .config("spark.executor.memory", "512m")
         .getOrCreate())

# Set log level to ERROR
spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/09/23 11:21:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [33]:
# Create some sample data frames
# A large data frame with 1 million rows and two columns: id and value
large_df = (spark.range(0, 1000000)
            .withColumn("date", date_sub(current_date(), (rand() * 365).cast("int")))
            .withColumn("age", (rand() * 100).cast("int"))
            .withColumn("salary", 100*(rand() * 100).cast("int"))
            .withColumn("gender", when((rand() * 2).cast("int") == 0, "M").otherwise("F"))
            .withColumn("grade", 
                        when((rand() * 5).cast("int") == 0, "IC")
                        .when((rand() * 5).cast("int") == 1, "IC-2")
                        .when((rand() * 5).cast("int") == 2, "M1")
                        .when((rand() * 5).cast("int") == 3, "M2")
                        .when((rand() * 5).cast("int") == 4, "IC-3")
                        .otherwise("M3")))
large_df.show(5)

+---+----------+---+------+------+-----+
| id|      date|age|salary|gender|grade|
+---+----------+---+------+------+-----+
|  0|2022-12-01| 77|  9600|     F|   IC|
|  1|2023-08-17| 51|  7500|     F|   M3|
|  2|2023-02-10| 58|  6700|     M|   IC|
|  3|2023-05-08| 21|  7300|     M|   M2|
|  4|2023-01-14| 34|  5500|     M|   M3|
+---+----------+---+------+------+-----+
only showing top 5 rows



In [34]:
from pyspark.sql.functions import col, avg

# Filter the DataFrame by gender
df_filtered = large_df.filter(col("age") >= 55)

# Map the DataFrame by adding 10% bonus to salary
df_mapped = df_filtered.withColumn("bonus", col("salary") * 1.1)

# Locally aggregate the DataFrame by computing the average bonus by age
df_aggregated = df_mapped.groupBy("age").agg(avg("bonus"))

# Print the result
df_aggregated.show(5)

+---+-----------------+
|age|       avg(bonus)|
+---+-----------------+
| 85|5433.093191540399|
| 65|5425.068434303069|
| 78|5433.110714750945|
| 81|5443.704115016358|
| 76|  5445.9012274569|
+---+-----------------+
only showing top 5 rows



In [35]:
df_aggregated.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[age#542], functions=[avg(bonus#589)])
   +- Exchange hashpartitioning(age#542, 200), ENSURE_REQUIREMENTS, [plan_id=1003]
      +- HashAggregate(keys=[age#542], functions=[partial_avg(bonus#589)])
         +- Project [age#542, (cast(salary#546 as double) * 1.1) AS bonus#589]
            +- Filter (isnotnull(age#542) AND (age#542 >= 55))
               +- Project [age#542, (cast((rand(-2852223682338606353) * 100.0) as int) * 100) AS salary#546]
                  +- Project [cast((rand(-24633094416071200) * 100.0) as int) AS age#542]
                     +- Range (0, 10000000, step=1, splits=2)




In [36]:
from pyspark.sql.functions import broadcast

# Create another DataFrame with some dummy data
df2 = spark.createDataFrame([(25, "A"), (30, "B"), (35, "C"), (40, "D"), (45, "E"), (50, "F"), (55, "G"), (60, "H"), (65, "I"), (70, "J")], ["age", "level"])

# Join the two DataFrames by age using broadcast join
df_joined = large_df.join(broadcast(df2), "age")

# Globally aggregate the joined DataFrame by computing the sum of salary by level using partial aggregation
df_aggregated = df_joined.groupBy("level").avg("salary")

# Print the result
df_aggregated.show()

+-----+------------------+
|level|       avg(salary)|
+-----+------------------+
|    F| 4948.528276926469|
|    E| 4948.176937814321|
|    B| 4943.605257876781|
|    D|4957.5704561678685|
|    C| 4931.937575557754|
|    J| 4934.395221008479|
|    A|4958.7079856021855|
|    G| 4973.518919352253|
|    I| 4931.880394820972|
|    H| 4951.178290409542|
+-----+------------------+



In [37]:
df_aggregated.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[level#624], functions=[avg(salary#546)])
   +- Exchange hashpartitioning(level#624, 200), ENSURE_REQUIREMENTS, [plan_id=1191]
      +- HashAggregate(keys=[level#624], functions=[partial_avg(salary#546)])
         +- Project [salary#546, level#624]
            +- BroadcastHashJoin [cast(age#542 as bigint)], [age#623L], Inner, BuildRight, false
               :- Filter isnotnull(age#542)
               :  +- Project [age#542, (cast((rand(-2852223682338606353) * 100.0) as int) * 100) AS salary#546]
               :     +- Project [cast((rand(-24633094416071200) * 100.0) as int) AS age#542]
               :        +- Range (0, 10000000, step=1, splits=2)
               +- BroadcastExchange HashedRelationBroadcastMode(List(input[0, bigint, false]),false), [plan_id=1186]
                  +- Filter isnotnull(age#623L)
                     +- Scan ExistingRDD[age#623L,level#624]




In [38]:
from pyspark.sql.functions import col

# Repartition the DataFrame by gender with 2 partitions
df_repartitioned = large_df.repartition(col("gender"))

# Repartition the DataFrame by age range with 5 partitions
df_repartitioned_by_range = large_df.repartitionByRange(5, col("age"))

2


[Stage 36:>                                                         (0 + 2) / 2]

2


[Stage 38:>                                                         (0 + 2) / 2]

5


In [39]:
large_df.explain()

== Physical Plan ==
*(1) Project [id#537L, date#539, age#542, salary#546, gender#551, CASE WHEN (cast((rand(7627123849445037294) * 5.0) as int) = 0) THEN IC WHEN (cast((rand(-3606855761943417541) * 5.0) as int) = 1) THEN IC-2 WHEN (cast((rand(2762691268524896822) * 5.0) as int) = 2) THEN M1 WHEN (cast((rand(-8349312454092533537) * 5.0) as int) = 3) THEN M2 WHEN (cast((rand(4327064802130063657) * 5.0) as int) = 4) THEN IC-3 ELSE M3 END AS grade#557]
+- *(1) Project [id#537L, date#539, age#542, salary#546, CASE WHEN (cast((rand(-1048537541828757866) * 2.0) as int) = 0) THEN M ELSE F END AS gender#551]
   +- *(1) Project [id#537L, date#539, age#542, (cast((rand(-2852223682338606353) * 100.0) as int) * 100) AS salary#546]
      +- *(1) Project [id#537L, date#539, cast((rand(-24633094416071200) * 100.0) as int) AS age#542]
         +- *(1) Project [id#537L, date_sub(2023-09-23, cast((rand(-4631883687744543562) * 365.0) as int)) AS date#539]
            +- *(1) Range (0, 10000000, step=1, sp

In [40]:
df_repartitioned.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=true
+- == Final Plan ==
   AQEShuffleRead coalesced
   +- ShuffleQueryStage 0
      +- Exchange hashpartitioning(gender#551, 200), REPARTITION_BY_COL, [plan_id=1259]
         +- *(1) Project [id#537L, date#539, age#542, salary#546, gender#551, CASE WHEN (cast((rand(7627123849445037294) * 5.0) as int) = 0) THEN IC WHEN (cast((rand(-3606855761943417541) * 5.0) as int) = 1) THEN IC-2 WHEN (cast((rand(2762691268524896822) * 5.0) as int) = 2) THEN M1 WHEN (cast((rand(-8349312454092533537) * 5.0) as int) = 3) THEN M2 WHEN (cast((rand(4327064802130063657) * 5.0) as int) = 4) THEN IC-3 ELSE M3 END AS grade#557]
            +- *(1) Project [id#537L, date#539, age#542, salary#546, CASE WHEN (cast((rand(-1048537541828757866) * 2.0) as int) = 0) THEN M ELSE F END AS gender#551]
               +- *(1) Project [id#537L, date#539, age#542, (cast((rand(-2852223682338606353) * 100.0) as int) * 100) AS salary#546]
                  +- *(1) Project [id#5

In [41]:
df_repartitioned_by_range.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=true
+- == Final Plan ==
   ShuffleQueryStage 0
   +- Exchange rangepartitioning(age#542 ASC NULLS FIRST, 5), REPARTITION_BY_NUM, [plan_id=1308]
      +- *(1) Project [id#537L, date#539, age#542, salary#546, gender#551, CASE WHEN (cast((rand(7627123849445037294) * 5.0) as int) = 0) THEN IC WHEN (cast((rand(-3606855761943417541) * 5.0) as int) = 1) THEN IC-2 WHEN (cast((rand(2762691268524896822) * 5.0) as int) = 2) THEN M1 WHEN (cast((rand(-8349312454092533537) * 5.0) as int) = 3) THEN M2 WHEN (cast((rand(4327064802130063657) * 5.0) as int) = 4) THEN IC-3 ELSE M3 END AS grade#557]
         +- *(1) Project [id#537L, date#539, age#542, salary#546, CASE WHEN (cast((rand(-1048537541828757866) * 2.0) as int) = 0) THEN M ELSE F END AS gender#551]
            +- *(1) Project [id#537L, date#539, age#542, (cast((rand(-2852223682338606353) * 100.0) as int) * 100) AS salary#546]
               +- *(1) Project [id#537L, date#539, cast((rand(-2463309

In [42]:
spark.stop()