In [0]:
# Spark Session
from pyspark.sql import SparkSession

# Settings handed to the session builder, applied in insertion order.
# NOTE(review): the master is local[*]; the executor/cores settings below may
# have no effect in local mode — confirm against the Spark configuration docs.
_session_conf = {
    "spark.executor.cores": 4,
    "spark.cores.max": 16,
    "spark.executor.memory": "512M",
}

_builder = SparkSession.builder.appName("Distributed Shared Variables").master("local[*]")
for _conf_key, _conf_value in _session_conf.items():
    _builder = _builder.config(_conf_key, _conf_value)

# Reuse an already-running session if one exists, otherwise create it.
spark = _builder.getOrCreate()
spark


In [0]:
# Read the EMP CSV file (10 million records) with an explicit DDL schema;
# the "header" option is switched on for the reader.
_schema = (
    "first_name string, last_name string, job_title string, dob date, "
    "email string, phone string, salary double, department string, "
    "department_id integer"
)
_emp_reader = spark.read.schema(_schema).option("header", True)
emp = _emp_reader.csv("/data/input/datasets/employee_records.csv")

### Broadcast Variable

In [0]:
# Lookup variable: department id -> display name, for ids 1 through 5
dept_names = {dept_id: f"Department {dept_id}" for dept_id in range(1, 6)}
# Broadcast the lookup through the SparkContext
broadcast_dept_names = spark.sparkContext.broadcast(dept_names)
# Show the type of the broadcast handle
type(broadcast_dept_names)

Out[10]: pyspark.broadcast.Broadcast

In [0]:
# Inspect the broadcast payload: .value hands back the wrapped Python object
# (the output below shows the department lookup dict).
broadcast_dept_names.value

Out[11]: {1: 'Department 1',
 2: 'Department 2',
 3: 'Department 3',
 4: 'Department 4',
 5: 'Department 5'}

- The action below (`.show()`) is processed in a single stage (no shuffle involved) because the broadcast variable is distributed to each of the executors.
- It is used below through a UDF, and no operation in the cell leads to a shuffle.

In [0]:
# Create UDF to return Department Name
from pyspark.sql.functions import udf, col


def getDeptNames(dept_id):
    """Return the broadcast lookup's name for dept_id, or None when the id is absent."""
    lookup = broadcast_dept_names.value
    return lookup.get(dept_id)


# Wrap the plain function as a Spark UDF (same effect as decorating it with @udf).
getDeptNames = udf(getDeptNames)

# Add the looked-up name as a new column and preview the first five rows untruncated.
emp_final = emp.withColumn(
    "dept_name",
    getDeptNames(col("department_id")),
)
emp_final.show(5, truncate=False)

+----------+---------+-----------------+----------+------------------------------+---------------+------------------+------------------+-------------+------------+
|first_name|last_name|job_title        |dob       |email                         |phone          |salary            |department        |department_id|dept_name   |
+----------+---------+-----------------+----------+------------------------------+---------------+------------------+------------------+-------------+------------+
|Jennifer  |Williams |HR Specialist    |1951-01-21|Jennifer.Williams.@example.com|+1-845-311-804 |42951.90537045701 |Finance           |6            |null        |
|James     |Miller   |Sales Executive  |1939-09-25|James.Miller.@example.com     |+1-274-633-7306|50933.8591162336  |Data and Analytics|6            |null        |
|Linda     |Jones    |Data Scientist   |2023-05-26|Linda.Jones.@example.com      |+1-149-733-8924|66274.49226944339 |Data and Analytics|2            |Department 2|
|Srishti   |Smit

### Accumulators

In [0]:
# Calculate total salary of Department 6
# NOTE: this import shadows Python's built-in sum in the notebook namespace;
# the cast-to-long cell further down calls sum("salary") and depends on it.
from pyspark.sql.functions import sum

(
    emp
    .filter("department_id=6")
    .groupBy("department_id")
    .agg(sum("salary"))
    .show()
)

+-------------+--------------------+
|department_id|         sum(salary)|
+-------------+--------------------+
|            6|1.248861142452698E11|
+-------------+--------------------+



The cell above has salary in exponential format, so we will cast it to long as shown below.

In [0]:
# Same department filter as the grouped aggregate above, without the grouping;
# the summed salary is cast to long so it is not shown in exponential notation.
# `sum` here is the pyspark.sql.functions.sum imported in an earlier cell.
(
    emp
    .filter("department_id=6")
    .agg(sum("salary").cast("long"))
    .show()
)

+---------------------------+
|CAST(sum(salary) AS BIGINT)|
+---------------------------+
|               124886114245|
+---------------------------+



- In the above action, an Exchange (shuffle) happens. (Check the *Spark UI* for more info.)
- So, to eliminate the shuffle, we will now use another kind of distributed variable: *Accumulators*.

In [0]:
# Accumulators
# Seed with 0.0 (not 0): salary is declared as double in the schema, so the
# accumulator holds a float from the start, even when no row matches.
dept_sal = spark.sparkContext.accumulator(0.0)

# Use foreach
def calculateSalary(department_id, salary):
    """Add one row's salary to the dept_sal accumulator if it belongs to department 6.

    Args:
        department_id: The row's department id (may be None).
        salary: The row's salary (may be None).

    Rows with a missing salary are skipped: adding None to the accumulator
    would raise a TypeError inside the task, and skipping matches how the
    sum() aggregate treats nulls, so both totals stay comparable.
    """
    if department_id == 6 and salary is not None:
        dept_sal.add(salary)

# foreach is an action: it runs calculateSalary once per row of emp.
emp.foreach(lambda row: calculateSalary(row.department_id, row.salary))


Out[30]: 124886114245.26982

In [0]:
# View total value
dept_sal.value

Out[31]: 124886114245.26982

- Now, if you check the Spark UI for the above accumulator job, it is processed in a single stage (no shuffling involved).
- Also, the total value is the same as the one we computed without accumulators (check cell 9).

> This was just a simple use case. You can use these two variables (Broadcast and Accumulators) for a long list of other use cases.