In [22]:
# Spark Session
from pyspark.sql import SparkSession

spark = (
    SparkSession
    .builder
    .appName("Optimizing Skewness and Spillage")
    .master("local[*]")
    .config("spark.cores.max", 8)
    .config("spark.executor.cores", 4)
    .config("spark.executor.memory", "512M")
    .getOrCreate()
)

spark

In [23]:
# Disable AQE and Broadcast join

spark.conf.set("spark.sql.adaptive.enabled", False)
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", False)
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

In [24]:
# Read Employee data
_schema = "first_name string, last_name string, job_title string, dob string, email string, phone string, salary double, department_id int"

emp = spark.read.format("csv").schema(_schema).option("header", True).load("data/input/employee_records.csv")

In [25]:
# Read DEPT CSV data
_dept_schema = "department_id int, department_name string, description string, city string, state string, country string"

dept = spark.read.format("csv").schema(_dept_schema).option("header", True).load("data/input/department_data.csv")


In [26]:
# Join Datasets

df_joined = emp.join(dept, on=emp.department_id==dept.department_id, how="left_outer")

In [27]:
df_joined.write.format("noop").mode("overwrite").save()

                                                                                

In [28]:
df_joined.explain()

== Physical Plan ==
*(4) SortMergeJoin [department_id#511], [department_id#520], LeftOuter
:- *(1) Sort [department_id#511 ASC NULLS FIRST], false, 0
:  +- Exchange hashpartitioning(department_id#511, 200), ENSURE_REQUIREMENTS, [plan_id=607]
:     +- FileScan csv [first_name#504,last_name#505,job_title#506,dob#507,email#508,phone#509,salary#510,department_id#511] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/Users/vrushabh.deokar/pysparkBasics/data/input/employee_records...., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<first_name:string,last_name:string,job_title:string,dob:string,email:string,phone:string,s...
+- *(3) Sort [department_id#520 ASC NULLS FIRST], false, 0
   +- Exchange hashpartitioning(department_id#520, 200), ENSURE_REQUIREMENTS, [plan_id=619]
      +- *(2) Filter isnotnull(department_id#520)
         +- FileScan csv [department_id#520,department_name#521,description#522,city#523,state#524,country#525] Batched: 

In [29]:
from pyspark.sql.functions import spark_partition_id, count, lit

part_df = df_joined.withColumn("partition_num", spark_partition_id()).groupBy("partition_num").agg(count(lit(1)).alias("count"))
part_df.show()

# Obersvation - Task Index (102, 103, 174) saw spillage on disk
# Task Index = partition_num
# Also from Default 200 Shuffled partition, 190 are idle 

                                                                                

+-------------+------+
|partition_num| count|
+-------------+------+
|          103|100417|
|          122| 99780|
|           43| 99451|
|          107| 99805|
|           49| 99706|
|           51|100248|
|          102|100214|
|           66|100210|
|          174|100155|
|           89|100014|
+-------------+------+



In [22]:
# Verify Employee data based on department_id

from pyspark.sql.functions import col, count, lit, desc

emp.groupBy("department_id").agg(count(lit(1)).alias("count")).show()

# Repartition will not work as department_id like 2, 3, 5 etc will again come to same partition 
# and hence Salting Technique comes into picture
# Approach - Add Salting to dept_id, do Shuffle and then remove Salting from dept_id

+-------------+------+
|department_id| count|
+-------------+------+
|            1| 99451|
|            6| 99706|
|            3|100248|
|            5|100210|
|            9|100014|
|            4|100214|
|            8|100417|
|            7| 99805|
|           10| 99780|
|            2|100155|
+-------------+------+



In [30]:
# Set shuffle partitions to a lesser number - 8

spark.conf.set("spark.sql.shuffle.partitions", 8)

In [31]:
# Let prepare the salt

import random
from pyspark.sql.functions import udf 

@udf 
def rand_udf():
    return random.randint(0,8)

salt_df = spark.range(0,8)
salt_df.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
+---+



In [32]:
# Salted Employee
from pyspark.sql.functions import concat, lit

salted_emp = emp.withColumn("salted_dept_id", concat("department_id", lit("_"), rand_udf()))
salted_emp.show(truncate = False)

+----------+----------+----------------------------------+----------+----------------------------+---------------------+--------+-------------+--------------+
|first_name|last_name |job_title                         |dob       |email                       |phone                |salary  |department_id|salted_dept_id|
+----------+----------+----------------------------------+----------+----------------------------+---------------------+--------+-------------+--------------+
|Richard   |Morrison  |Public relations account executive|1973-05-05|melissagarcia@example.org   |(699)525-4827        |512653.0|8            |8_3           |
|Bobby     |Mccarthy  |Barrister's clerk                 |1974-04-25|llara@example.net           |(750)846-1602x7458   |999836.0|7            |7_4           |
|Dennis    |Norman    |Land/geomatics surveyor           |1990-06-24|jturner@example.net         |873.820.0518x825     |131900.0|10           |10_6          |
|John      |Monroe    |Retail buyer           

In [33]:
# Salted Department
salted_dept = dept.join(salt_df, how="cross").withColumn("salted_dept_id", concat("department_id", lit("_"), "id" ))
salted_dept.where("department_id == 1").show()

+-------------+---------------+--------------------+------------+-----+-------------------+---+--------------+
|department_id|department_name|         description|        city|state|            country| id|salted_dept_id|
+-------------+---------------+--------------------+------------+-----+-------------------+---+--------------+
|            1|    Bryan-James|Optimized disinte...|Melissaburgh|   FM|Trinidad and Tobago|  0|           1_0|
|            1|    Bryan-James|Optimized disinte...|Melissaburgh|   FM|Trinidad and Tobago|  1|           1_1|
|            1|    Bryan-James|Optimized disinte...|Melissaburgh|   FM|Trinidad and Tobago|  2|           1_2|
|            1|    Bryan-James|Optimized disinte...|Melissaburgh|   FM|Trinidad and Tobago|  3|           1_3|
|            1|    Bryan-James|Optimized disinte...|Melissaburgh|   FM|Trinidad and Tobago|  4|           1_4|
|            1|    Bryan-James|Optimized disinte...|Melissaburgh|   FM|Trinidad and Tobago|  5|           1_5|
|

In [34]:
# Lets make the salted join now

df_salted_joined = salted_emp.join(salted_dept, on=emp.department_id==dept.department_id, how="left_outer")

In [35]:
df_salted_joined.write.format("noop").mode("overwrite").save()

                                                                                

In [36]:
# Check the partition details to understand distribution

from pyspark.sql.functions import spark_partition_id, count, lit

part_df = salted_emp.withColumn("partition_num", spark_partition_id()).groupBy("partition_num").agg(count(lit(1)).alias("count"))
part_df.show()

+-------------+------+
|partition_num| count|
+-------------+------+
|            6|130312|
|            5|130403|
|            1|130406|
|            3|130393|
|            7| 87281|
|            2|130400|
|            4|130384|
|            0|130421|
+-------------+------+



In [37]:
spark.stop()

In [2]:
# What is Skewness? - Basically unbalanced data not distributed equally 
# Type of Spillage - 
# Spill Memory - deserialised, Spilled on Memory 
# Spill Disk - serialised, Spilled on Disk