When most of your records share the same key (e.g. "India" or "NULL"), any shuffle on that key (groupBy, join) hashes all of those rows into the same partition → the single task processing that partition is overloaded → straggler tasks → slow jobs. This is data skew.

In [8]:
from pyspark.sql import SparkSession
# Build (or reuse) the Spark session for this notebook, with Hive support enabled.
spark = (
    SparkSession.builder
    .appName("BucketingExample")
    .enableHiveSupport()
    .getOrCreate()
)

# Load the sample data: the first CSV row is the header, column types are inferred.
df = spark.read.csv("country.csv", header=True, inferSchema=True)
df.show()

+-------+-------+
|country|revenue|
+-------+-------+
|  India|    100|
|  India|    200|
|  India|    150|
|    USA|    400|
|     UK|    300|
+-------+-------+



In [None]:
# Baseline aggregation: total revenue per country.
# Spark hashes the grouping column, so nearly all rows (the "India" ones)
# end up in a single partition. That imbalance is data skew.
(
    df.groupBy("country")
    .sum("revenue")
    .show()
)

+-------+------------+
|country|sum(revenue)|
+-------+------------+
|  India|         450|
|    USA|         400|
|     UK|         300|
+-------+------------+



In [16]:
from pyspark.sql.functions import col, lit, rand

# Salting: tag every row with a random integer in [0, 9].
# rand() is uniform in [0, 1); scaling by 10 and casting to int truncates it to 0..9.
salt_bucket = (rand() * 10).cast("int")
salted_df = df.withColumn("salt", salt_bucket)


In [23]:
# Stage 1 of the salted aggregation: partial sums per (country, salt) pair.
# Keep the DataFrame itself in `intermediate`; DataFrame.show() only prints and
# returns None, so chaining .show() onto the assignment would store None instead.
intermediate = salted_df.groupBy("country", "salt").sum("revenue")
intermediate.show()


+-------+----+------------+
|country|salt|sum(revenue)|
+-------+----+------------+
|    USA|   2|         400|
|  India|   8|         150|
|  India|   1|         200|
|     UK|   9|         300|
|  India|   4|         100|
+-------+----+------------+

