In [0]:
# Spark Session
from pyspark.sql import SparkSession
spark = (
        SparkSession
        .builder
        .appName("Optimizing Skewness and Spillage")
        .master("local[*]")
        .config("spark.executor.cores",4)
        .config("spark.cores.max",8)
        .config("spark.executor.memory", "512M")
        .config("spark.sql.autoBroadcastJoinThreshold", -1)
        .config("spark.sql.adaptive.enabled", False)
        .config("spark.sql.adaptive.coalescePartitions.enabled", False)
        .getOrCreate()
)
spark


In [0]:
# Disable Adaptive Query Engine(AQE) and Broadcast Join
# spark.conf.set("spark.sql.adaptive.enabled", False)
# spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", False)
# spark.conf.set("spark.sql.adaptive.autoBroadcastJoinThreshold", -1)

In [0]:
print(spark.conf.get("spark.sql.autoBroadcastJoinThreshold"))

-1


In [0]:
# Read EMP CSV file with 10 million records
emp_schema = "first_name string, last_name string, job_title string, dob date, email string, phone string, salary double, department string, department_id integer"
emp = spark.read.schema(emp_schema).option("header",True).csv("/data/input/datasets/employee_recs.csv")

In [0]:
# Read DEPT CSV file with 10 records
dept_schema ="department_id int, department_name string, description string, city string, state string, country string "
dept = spark.read.schema(dept_schema).option("header",True).csv("/data/input/datasets/department_recs.csv")

In [0]:
# JOINING datasets
dfjoined = emp.join(dept, on='department_id', how="left_outer")
dfjoined.write.format("noop").mode("overwrite").save()

In [0]:
dfjoined.explain()

== Physical Plan ==
*(2) Project [department_id#72, first_name#64, last_name#65, job_title#66, dob#67, email#68, phone#69, salary#70, department#71, department_name#83, description#84, city#85, state#86, country#87]
+- *(2) SortMergeJoin [department_id#72], [department_id#82], LeftOuter
   :- Sort [department_id#72 ASC NULLS FIRST], false, 0
   :  +- Exchange hashpartitioning(department_id#72, 200), ENSURE_REQUIREMENTS, [plan_id=98]
   :     +- FileScan csv [first_name#64,last_name#65,job_title#66,dob#67,email#68,phone#69,salary#70,department#71,department_id#72] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[dbfs:/data/input/datasets/employee_recs.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<first_name:string,last_name:string,job_title:string,dob:date,email:string,phone:string,sal...
   +- Sort [department_id#82 ASC NULLS FIRST], false, 0
      +- Exchange hashpartitioning(department_id#82, 200), ENSURE_REQUIREMENTS, [plan_id=13

- In Spark UI, if you check '*Shuffle read size/records*' for 200 tasks being called for Stage3(check cell 6 output), onle few of them has processed the data. Rest all have done nothing.
- So, the default shuffle partitions 200 is a waste for us. This is called skewness and might cause spillage in Memory and Disk in extreme cases and thus decreases performance.
- Let's verify it below. If you see it shows 10 partitions but our Shuffle Partitions are 200 i.e 190 partitions has no data (Check '*Shuffle read size/records*' in Spark UI) 
- This is why we need to configure our '*Shuffle Partitions*' properly.

In [0]:
# Check partition details to understand distribution
from pyspark.sql.functions import spark_partition_id, count, lit
dfpartitioned = dfjoined.withColumn("partitionnum", spark_partition_id()).groupBy("partitionnum").agg(count(lit(1)).alias("count"))
dfpartitioned.show()

+------------+-------+
|partitionnum|  count|
+------------+-------+
|         103|1000525|
|         122| 999664|
|          43| 999585|
|         107| 999690|
|          49| 998824|
|          51|1000469|
|         102|1000749|
|          66|1001702|
|         174| 999380|
|          89| 999412|
+------------+-------+



In [0]:
# Verify Employee data based on department_id

emp.groupBy("department_id").agg(count(lit(1))).show()
# We can see some minimal skewing because some department_id have data count slightly higher as shown below

+-------------+--------+
|department_id|count(1)|
+-------------+--------+
|            1|  999585|
|            6|  998824|
|            3| 1000469|
|            5| 1001702|
|            9|  999412|
|            4| 1000749|
|            8| 1000525|
|            7|  999690|
|           10|  999664|
|            2|  999380|
+-------------+--------+



> Even though the skewness here is minimal, in extreme cases how do we fix skewness?
- Since the 'joining column' itself is on department here, even if we repartition the data, the skewed departments will again come back to the same numbers and we will again have skewness for the data.
- Here comes '*Salting*' to the rescue. It will have data evenly distributed across the partitions and thus all of the task will be procesing data evenly without any Spillage. 




### SALTING

In [0]:
# Set Shuffle Partitions to a lesser number - 16
spark.conf.set("spark.sql.shuffle.partitions", 16)

In [0]:
# Let's prepare the salt
import random
from pyspark.sql.functions import udf

# UDF to create a random number query every time and add to employee as Salt 
@udf
def saltudf():
    return random.randint(0,16)
    
# Salt dataframe to add to department
saltdf = spark.range(0,16)
saltdf.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
| 10|
| 11|
| 12|
| 13|
| 14|
| 15|
+---+



In [0]:
# SALTED Employee

from pyspark.sql.functions import lit, concat
saltedemp = emp.withColumn("salted_deptId", concat("department_id",lit("_"), saltudf()))
saltedemp.show(truncate=False)

+-------------+------------+--------------+----------+-----------------------+------------+---------+---------------+-------------+-------------+
|first_name   |last_name   |job_title     |dob       |email                  |phone       |salary   |department     |department_id|salted_deptId|
+-------------+------------+--------------+----------+-----------------------+------------+---------+---------------+-------------+-------------+
|FirstName_280|LastName_280|Data Scientist|1982-02-27|user1249280@example.com|+13950857290|105255.84|Engineering    |3            |3_8          |
|FirstName_281|LastName_281|Team Lead     |1996-06-07|user1249281@example.com|+15818087627|39471.77 |Marketing      |5            |5_16         |
|FirstName_282|LastName_282|Data Analyst  |1996-09-02|user1249282@example.com|+18214812697|71801.39 |Data           |1            |1_5          |
|FirstName_283|LastName_283|HR Coordinator|1968-03-25|user1249283@example.com|+13544214736|110637.46|Support        |6      

In [0]:
salteddept = dept.join(saltdf, how="cross").withColumn("salted_deptId", concat("department_id", lit("_"), "id"))
salteddept.show()

+-------------+---------------+--------------------+-------------+-----+-------+---+-------------+
|department_id|department_name|         description|         city|state|country| id|salted_deptId|
+-------------+---------------+--------------------+-------------+-----+-------+---+-------------+
|            1|           Data|     Data Department|     New York|   NY|    USA|  0|          1_0|
|            1|           Data|     Data Department|     New York|   NY|    USA|  1|          1_1|
|            2|Human Resources|       HR Department|      Chicago|   IL|    USA|  0|          2_0|
|            2|Human Resources|       HR Department|      Chicago|   IL|    USA|  1|          2_1|
|            3|    Engineering|Engineering Depar...|San Francisco|   CA|    USA|  0|          3_0|
|            3|    Engineering|Engineering Depar...|San Francisco|   CA|    USA|  1|          3_1|
|            4|        Finance|  Finance Department|       Boston|   MA|    USA|  0|          4_0|
|         

In [0]:
salteddept.where("department_id=9").show()

+-------------+---------------+--------------------+-----+-----+-------+---+-------------+
|department_id|department_name|         description| city|state|country| id|salted_deptId|
+-------------+---------------+--------------------+-----+-----+-------+---+-------------+
|            9|     Operations|Operations Depart...|Miami|   FL|    USA|  0|          9_0|
|            9|     Operations|Operations Depart...|Miami|   FL|    USA|  1|          9_1|
|            9|     Operations|Operations Depart...|Miami|   FL|    USA|  2|          9_2|
|            9|     Operations|Operations Depart...|Miami|   FL|    USA|  3|          9_3|
|            9|     Operations|Operations Depart...|Miami|   FL|    USA|  4|          9_4|
|            9|     Operations|Operations Depart...|Miami|   FL|    USA|  5|          9_5|
|            9|     Operations|Operations Depart...|Miami|   FL|    USA|  6|          9_6|
|            9|     Operations|Operations Depart...|Miami|   FL|    USA|  7|          9_7|

if you see for all department_id's till 15(0-15 i.e. 16 rows) we have added SAlTING above.

In [0]:
# Let's make the SALTED join now
saltedjoineddf = saltedemp.join(salteddept, on='salted_deptId', how="left_outer")

In [0]:
from pyspark.sql.functions import spark_partition_id, count, lit
dfpartitioned = saltedjoineddf.withColumn("partitionnum", spark_partition_id()).groupBy("partitionnum").agg(count(lit(1)).alias("count"))
dfpartitioned.show()

+------------+------+
|partitionnum| count|
+------------+------+
|          12|354557|
|           5|705736|
|          10|588190|
|           1|352451|
|           3|588690|
|           2|824773|
|          13|706439|
|          14|471390|
|           6|645863|
|           9|824469|
|           7|881842|
|          11|939757|
|          15|470043|
|           4|469958|
|           0|587533|
|           8|588309|
+------------+------+



- Now, you can see above that the data has been distributed evenly across all the 16 partitions and there is no skewness.
- It Took only 16 tasks (16 Shuffle Partitions we configured in cell 13) to process the data as opposed to 200 when we didnt use Salting

> SUMMARY
- In '*Summary Metrics*' tab SPARK UI Under '*Stages*' you will see the 'Duration' to process the job will be significantly reduced after using 'SALTING' technique.