In [1]:
from pyspark.sql import SparkSession

In [8]:
spark = SparkSession.builder.appName("ModernEmployeeData").getOrCreate()
df = spark.read.csv("modern_employee_data.csv", header=True, inferSchema=True)
df.show(5)
df.write.mode("overwrite").parquet("modern_employee_data.parquet")


+-----------+-------------------+------+---+-------+------------------+----------+------+------------+----------------+-----------------+--------------------+--------------+
|employee_id|               name|gender|age|country|             state|department|salary|joining_date|experience_years|performance_score|               email|  phone_number|
+-----------+-------------------+------+---+-------+------------------+----------+------+------------+----------------+-----------------+--------------------+--------------+
|          1|      Katrina Riley|  Male| 51|     UK|South Allisonmouth| Marketing| 97018|  2020-04-30|               5|             3.88|   becky87@gmail.com|  326-034-8112|
|          2|         Divij Raja| Other| 60|     UK|          Bhilwara| Marketing| 83808|  2016-05-25|               2|             4.57|lagan10@krishnan-...|   07045303968|
|          3|        Ivana Divan|  Male| 29|  India|         Rajasthan| Marketing|160911|  2019-10-24|              11|           

In [30]:
df.rdd.getNumPartitions()

1

In [21]:
df2 = df.repartition(10)
print(df2.rdd.getNumPartitions()) 

10


In [20]:
df2 = df.repartition(5, "department")
df2.rdd.getNumPartitions()

5

`repartition()` is costly because of full network shuffle —
data moves between executors.

Use only when:

- You need to increase partitions, or

- You need to repartition by a key column.

`coalesce()` – Reduce partitions (NO full shuffle)

In [29]:
df.rdd.__sizeof__()

16

In [96]:
# Minimum partition size when reading
spark.conf.set("spark.sql.files.maxPartitionBytes", "128MB")  # Default: 128MB

# Example:
# Total data: 10 GB
# maxPartitionBytes: 128MB
# Partitions created: 10GB / 128MB = ~80 partitions

In [97]:
df = spark.read.parquet("modern_employee_data.parquet", header=True, inferSchema=True)

In [98]:
df.rdd.getNumPartitions()

1

In [99]:
filtered = df
filtered.coalesce(10).write.parquet("output/")


In [112]:
df_rc = spark.read.csv("modern_employee_data.csv", header=True, inferSchema=True)

# Step 1: Increase partitions to improve parallelism
df_rc = df.repartition(12)

# Step 2: Process and filter data
processed = df_rc.filter(df_rc.experience_years > 18)

# Step 3: Reduce partitions before writing
processed.coalesce(4).write.parquet("expert_employees/")

In [113]:
df_rc.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Exchange RoundRobinPartitioning(12), REPARTITION_BY_NUM, [plan_id=1112]
   +- FileScan parquet [employee_id#1113,name#1114,gender#1115,age#1116,country#1117,state#1118,department#1119,salary#1120,joining_date#1121,experience_years#1122,performance_score#1123,email#1124,phone_number#1125] Batched: true, DataFilters: [], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/home/developer/Workspace_Projects/Data_Engineer/PySpark/modern_e..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<employee_id:int,name:string,gender:string,age:int,country:string,state:string,department:s...




In [116]:
processed.show(5)

+-----------+-------------------+------+---+---------+---------------+----------+------+------------+----------------+-----------------+--------------------+------------------+
|employee_id|               name|gender|age|  country|          state|department|salary|joining_date|experience_years|performance_score|               email|      phone_number|
+-----------+-------------------+------+---+---------+---------------+----------+------+------------+----------------+-----------------+--------------------+------------------+
|        340|      Valerie Brown|Female| 45|       UK|       Chadside|     Sales| 45136|  2016-06-23|              20|             3.78| rburton@hotmail.com|  456.322.9746x061|
|        143|        Brandon Lee|  Male| 23|Australia|Port Jamesmouth|        IT|157344|  2016-03-27|              20|             3.22|     jryan@yahoo.com|  +44(0)1614960558|
|        138|      Elaine Savage|  Male| 36|       UK|   Matthewsberg|   Finance|244811|  2023-06-12|              

In [None]:
processed.rdd.getNumPartitions()

12