In [3]:
# # Spark Session
# from pyspark.sql import SparkSession

# spark = (
#     SparkSession
#     .builder
#     .appName("Optimizing Shuffles")
#     .master("spark://172.20.16.1:7077")
#     .config("spark.cores.max", 16)
#     .config("spark.executor.cores", 4)
#     .config("spark.executor.memory", "512M")
#     .getOrCreate()
# )

# spark

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg

# Local-mode session for the shuffle experiments: "local[*]" uses all
# available cores, and the shuffle partition count is set when the
# session is built.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.sql.shuffle.partitions", "16")
    .appName("Optimizing Shuffles")
    .getOrCreate()
)

# print(f"Default Parallelism: {spark.sparkContext.defaultParallelism}")

# # Read EMP CSV file with 10M records
# _schema = "first_name string, last_name string, job_title string, dob string, email string, phone string, salary double, department_id int"
# emp = spark.read.format("csv").schema(_schema).option("header", True).load(r"C:\Users\vivek\Downloads\pyspark_lecture_tutorial\employee_records.csv")

# print(f"Initial partitions: {emp.rdd.getNumPartitions()}")

# # Find out avg salary as per dept
# emp_avg = emp.groupBy("department_id").agg(avg("salary").alias("avg_sal"))

# # Show results
# emp_avg.show()

# # Write data for performance Benchmarking
# emp_avg.write.format("noop").mode("overwrite").save()

# # Verify shuffle partition setting
# print(f"Shuffle partitions: {spark.conf.get('spark.sql.shuffle.partitions')}")

# spark.stop()

In [2]:
# Report the default parallelism of the active Spark context
default_parallelism = spark.sparkContext.defaultParallelism
print(f"Default Parallelism: {default_parallelism}")

Default Parallelism: 16


In [3]:
# Read EMP CSV file with 10M records
import os

# Location of the employee CSV. Set the EMP_CSV_PATH environment variable to
# run this notebook on another machine; the original local path is the fallback.
EMP_CSV_PATH = os.environ.get(
    "EMP_CSV_PATH",
    r"C:\Users\vivek\Downloads\pyspark_lecture_tutorial\employee_records.csv",
)

# Explicit DDL schema passed to the reader.
# NOTE(review): the outputs recorded in this notebook show department_id as
# null for every displayed row and a single null group in the aggregate, which
# suggests the declared columns/types do not line up with the file — verify the
# CSV's actual column order and the department_id type.
_schema = "first_name string, last_name string, job_title string, dob string, email string, phone string, salary double, department_id int"

emp = (
    spark.read.format("csv")
    .schema(_schema)
    .option("header", True)
    .load(EMP_CSV_PATH)
)

# Number of partitions the file was split into on read
print(f"Initial partitions: {emp.rdd.getNumPartitions()}")

Initial partitions: 12


In [4]:
# Average salary for each department_id, exposed as column "avg_sal"
avg_sal_col = avg("salary").alias("avg_sal")
emp_avg = emp.groupBy("department_id").agg(avg_sal_col)

In [6]:
# Show results
# NOTE(review): the output recorded for this cell has a single row whose
# department_id is null, i.e. every record landed in one null group.
# department_id is probably not being parsed from the CSV — verify the schema.
emp_avg.show()

+-------------+------------------+
|department_id|           avg_sal|
+-------------+------------------+
|         null|504759.24000329373|
+-------------+------------------+



In [5]:
# Write data for performance Benchmarking
# The "noop" format is presumably used so the whole aggregation executes
# without producing an output dataset — confirm against the Spark docs.

emp_avg.write.format("noop").mode("overwrite").save()

In [9]:
# # Check Spark Shuffle Partition setting

# spark.conf.get("spark.sql.shuffle.partitions")

'16'

In [7]:
# Confirm the shuffle partition count the session is configured with
shuffle_partitions = spark.conf.get('spark.sql.shuffle.partitions')
print(f"Shuffle partitions: {shuffle_partitions}")

Shuffle partitions: 16


In [10]:
# spark.conf.set("spark.sql.shuffle.partitions", 100)

In [8]:
from pyspark.sql.functions import spark_partition_id

# Tag each row with the id of the partition it sits in, then display only
# the rows that belong to partition 0.
emp_with_partition = emp.withColumn("partition_id", spark_partition_id())
emp_with_partition.where("partition_id = 0").show()

+----------+----------+--------------------+----------+--------------------+--------------------+--------+-------------+------------+
|first_name| last_name|           job_title|       dob|               email|               phone|  salary|department_id|partition_id|
+----------+----------+--------------------+----------+--------------------+--------------------+--------+-------------+------------+
|   Richard|  Morrison|Public relations ...|1973-05-05|melissagarcia@exa...|       (699)525-4827|512653.0|         null|           0|
|     Bobby|  Mccarthy|   Barrister's clerk|1974-04-25|   llara@example.net|  (750)846-1602x7458|999836.0|         null|           0|
|    Dennis|    Norman|Land/geomatics su...|1990-06-24| jturner@example.net|    873.820.0518x825|131900.0|         null|           0|
|      John|    Monroe|        Retail buyer|1968-06-16|  erik33@example.net|    820-813-0557x624|485506.0|         null|           0|
|  Michelle|   Elliott|      Air cabin crew|1975-03-31|tiffany

In [14]:
# Read the partitioned data

# emp_part = spark.read.format("csv").schema(_schema).option("header", True).load("/data/input/emp_partitioned.csv/")

In [15]:
# emp_avg = emp_part.groupBy("department_id").agg(avg("salary").alias("avg_sal"))

In [13]:
# emp_avg.write.format("noop").mode("overwrite").save()