In [1]:
# Spark session
# Build the session for this notebook, or reuse one that already exists
# (getOrCreate). The "local[*]" master runs Spark locally on this machine.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Optimizing Shuffles").master("local[*]").getOrCreate()

# Bare last expression so the notebook displays the session.
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/12 18:05:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Check Spark defaultParallelism
# This reported 8 when the notebook was run; the value may differ on another
# machine.

spark.sparkContext.defaultParallelism

8

In [3]:
# Switch off the automatic optimisations for this session.

# Broadcast joins: a threshold of -1 disables automatic broadcasting.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

# Adaptive Query Execution (AQE) and its partition coalescing.
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", False)
spark.conf.set("spark.sql.adaptive.enabled", False)

In [8]:
# Load the employee CSV (10M records according to the author's note) with an
# explicit schema and a header row.

_schema = (
    "first_name string, last_name string, job_title string, "
    "dob string, email string, phone string, "
    "salary double, department_id int"
)

emp = (
    spark.read
    .format("csv")
    .schema(_schema)
    .option("header", True)
    .load("data/input/employee_records.csv")
)


In [18]:
# Find out avg salary as per dept
#
# The alias must be applied to the aggregate *column*. Calling .alias() on the
# result of .agg() only aliases the DataFrame and leaves the column named
# "avg(salary)" instead of "avg_sal".

from pyspark.sql.functions import avg

emp_avg = emp.groupBy("department_id").agg(avg("salary").alias("avg_sal"))

In [19]:
# Performance benchmark: trigger the aggregation by writing the result to the
# "noop" format, so no output files are produced.

(
    emp_avg.write
    .format("noop")
    .mode("overwrite")
    .save()
)

                                                                                

In [20]:
# Check Spark Shuffle Partition setting
# (returned as a string; it was '200' when this notebook was run)
spark.conf.get("spark.sql.shuffle.partitions")

'200'

In [22]:
# Set the number of shuffle partitions to 8, which equals the
# defaultParallelism value reported earlier in this notebook.
#
# Trade-off noted by the author:
#   - too many partitions: tasks are kept busy with network and disk I/O
#   - too few partitions: tasks may run out of memory (OOM)
spark.conf.set("spark.sql.shuffle.partitions", 8)

In [24]:
from pyspark.sql.functions import spark_partition_id

# Tag every row of `emp` with the id of the partition it belongs to, keep only
# partition 0 and print a single sample row.
(
    emp
    .withColumn("partition_id", spark_partition_id())
    .where("partition_id = 0")
    .show(1)
)

# NOTE(review): the author's original note says there are 8 partitions and 10
# department_id values, resulting in 10 records per partition. Presumably this
# refers to the per-task shuffle records of the aggregation above; confirm.

+----------+---------+--------------------+----------+--------------------+-------------+--------+-------------+------------+
|first_name|last_name|           job_title|       dob|               email|        phone|  salary|department_id|partition_id|
+----------+---------+--------------------+----------+--------------------+-------------+--------+-------------+------------+
|   Richard| Morrison|Public relations ...|1973-05-05|melissagarcia@exa...|(699)525-4827|512653.0|            8|           0|
+----------+---------+--------------------+----------+--------------------+-------------+--------+-------------+------------+
only showing top 1 row



In [34]:
# Read the partitioned data
#
# The path names a Parquet dataset ("emp.parquet"), so it is read with the
# Parquet reader. The CSV reader used previously would try to parse the binary
# Parquet files as text. Parquet files carry their own schema, so the CSV
# schema string is not passed here.
# NOTE(review): assumes the directory really contains Parquet files with
# "department_id" and "salary" columns, as its name suggests; confirm against
# the cell that wrote it.

emp_part = spark.read.format("parquet").load("data/output/11/2/emp.parquet")


In [35]:
from pyspark.sql.functions import avg

# Average salary per department on the partitioned data.
# The alias is applied to the aggregate column; .alias() on the result of
# .agg() would only alias the DataFrame and leave the column named
# "avg(salary)".
emp_avg = emp_part.groupBy("department_id").agg(avg("salary").alias("avg_sal"))

In [36]:
# Performance benchmark on the partitioned input.
# Author's recorded observation: with the partitioned read, the shuffle write
# dropped from 5.6 KB to 953 B.

(
    emp_avg.write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [38]:
# Good shuffling practices
# 1. Repartition the data properly.
# 2. Filter out data in the early stages.
# 3. Keep the amount of data that has to be shuffled as small as possible.

# Stop the Spark session at the end of the notebook.
spark.stop()