#### Bucketing physically organizes your data into a fixed number of buckets (files) based on the hash of a column.

__Each bucket stores records with the same hash(column) % numBuckets.__

In [2]:
from pyspark.sql import SparkSession
# Build (or reuse) the session for this notebook, with Hive support enabled.
spark = (
    SparkSession.builder
    .appName("BucketingExample")
    .enableHiveSupport()
    .getOrCreate()
)

# Load both CSV inputs: the first row is the header and column types are inferred.
employees = spark.read.options(header=True, inferSchema=True).csv("employees2.csv")
salaries = spark.read.options(header=True, inferSchema=True).csv("salaries.csv")


ModuleNotFoundError: No module named 'pyspark'

In [11]:
# Persist both DataFrames as bucketed tables: 4 buckets keyed on "department".
# mode("overwrite") replaces a table left behind by an earlier run; with the
# default save mode, saveAsTable raises when the table already exists, so this
# cell could not be re-run.
(
    employees.write
    .mode("overwrite")
    .bucketBy(4, "department")
    .sortBy("employee_id")  # rows inside each bucket are ordered by employee_id
    .saveAsTable("bucketed_employees")
)

(
    salaries.write
    .mode("overwrite")
    .bucketBy(4, "department")
    .sortBy("department")
    .saveAsTable("bucketed_salaries")
)


Because both tables are bucketed on `department` into the same number of buckets (4), Spark can now join them on that column without shuffling either side.

In [13]:
# Read the two bucketed tables back from the catalog.
employees_df = spark.read.table("bucketed_employees")
salaries_df = spark.read.table("bucketed_salaries")

# Inner equi-join on the shared bucketing column.
joined = employees_df.join(salaries_df, on="department", how="inner")

# Show the employee fields next to the per-department salary fields.
joined.select(
    "employee_id",
    "name",
    "department",
    "salary",
    "avg_salary",
    "bonus_percentage",
).show()



+-----------+-------+----------+------+----------+----------------+
|employee_id|   name|department|salary|avg_salary|bonus_percentage|
+-----------+-------+----------+------+----------+----------------+
|         10|   John|   Finance| 75000|     82000|               9|
|          7|  Grace|   Finance| 90000|     82000|               9|
|          8|  Helen|        HR| 52000|     60000|               8|
|          1|  Alice|        HR| 50000|     60000|               8|
|          6|  Frank|        IT| 80000|     75000|              12|
|          3|Charlie|        IT| 65000|     75000|              12|
|          2|    Bob|        IT| 60000|     75000|              12|
|          9|  Isaac|     Sales| 62000|     70000|              10|
|          5|    Eva|     Sales| 72000|     70000|              10|
|          4|  David|     Sales| 70000|     70000|              10|
+-----------+-------+----------+------+----------+----------------+

