In [0]:
# Session-level Spark SQL settings for this notebook: adaptive query execution
# is switched off and the auto-broadcast join threshold is set to -1.
for _option, _setting in (
    ("spark.sql.adaptive.enabled", "false"),
    ("spark.sql.autoBroadcastJoinThreshold", -1),
):
    spark.conf.set(_option, _setting)

In [0]:
from pyspark.sql import functions as F, types as T

In [0]:
# df1: ids from spark.range(1, 100000) with step 1, requested as 10 partitions,
# plus a SHA-256 digest of the id and two columns derived from rand().
# NOTE(review): "sale" and "exposure" both call rand(seed=123) — confirm that
# sharing one seed across the two columns is intended.
df1 = spark.range(start=1, end=100000, step=1, numPartitions=10).select(
    F.col("id").cast("int"),
    F.sha2(F.col("id").cast("string"), numBits=256).alias("hash"),
    (F.rand(seed=123) * 90).alias("sale"),
    (F.col("id") * F.rand(seed=123)).alias("exposure"),
)
df1.display()

In [0]:
# Report how many partitions df1's underlying RDD has.
df1.rdd.getNumPartitions()

In [0]:
# df2: same column layout as df1, but the ids come from spark.range(1, 100000)
# with step 2 (every other id), again requested as 10 partitions.
# NOTE(review): "sale" and "exposure" both call rand(seed=123) — confirm that
# sharing one seed across the two columns is intended.
df2 = spark.range(start=1, end=100000, step=2, numPartitions=10).select(
    F.col("id").cast("int"),
    F.sha2(F.col("id").cast("string"), numBits=256).alias("hash"),
    (F.rand(seed=123) * 90).alias("sale"),
    (F.col("id") * F.rand(seed=123)).alias("exposure"),
)
df2.display()

In [0]:
# Report how many partitions df2's underlying RDD has.
df2.rdd.getNumPartitions()

In [0]:
# Inner-join df1 (aliased "a") with df2 (aliased "b") on "id", keep only the
# columns qualified with the "a" alias, then render the result.
df = df1.alias("a").join(df2.alias("b"), ["id"], "inner").select(
    F.col("a.id"), F.col("a.hash"), F.col("a.sale"), F.col("a.exposure")
)
df.display()

In [0]:
# Report how many partitions df1's underlying RDD has.
# NOTE(review): this repeats the same df1 check made earlier in the notebook;
# if the goal was to inspect the joined result, this should presumably be
# `df` instead — confirm.
df1.rdd.getNumPartitions()

In [0]:
# Persist both frames as parquet tables with 10 buckets, sorted within each
# bucket. The bucket/sort column is the join key "id": every join in this
# notebook is on "id", and a bucketed layout only lets Spark skip the join
# shuffle when the bucket column matches the join key. Bucketing on "hash"
# gave a layout that no join here could use.
for _frame, _table_name in ((df1, "bucket_table1"), (df2, "bucket_table2")):
    (
        _frame.write.format("parquet")
        .bucketBy(10, "id")
        .sortBy("id")
        .mode("overwrite")
        .saveAsTable(_table_name)
    )

In [0]:
# List the files stored under bucket_table1's warehouse directory
# (Python equivalent of the `%fs ls` magic).
display(dbutils.fs.ls("/user/hive/warehouse/bucket_table1"))

In [0]:
# join bucketed tables

# Read the tables by name through the catalog rather than loading the parquet
# files by path. The bucket spec recorded by bucketBy(...).saveAsTable(...) is
# table metadata; a path-based parquet load returns a plain DataFrame with no
# bucketing information, so the planner cannot use it. Reading by name also
# avoids hard-coding the warehouse directory.
bucketed_df1 = spark.table("bucket_table1")
bucketed_df2 = spark.table("bucket_table2")

In [0]:
# Print a plain-text preview of bucketed_df1 using show()'s defaults.
bucketed_df1.show()

In [0]:
# Co-partition both sides into 10 partitions on the join key and sort each
# partition by it. The frames are joined on "id" below, so partitioning on
# "hash" (as before) forced the join to shuffle both sides a second time and
# made this step pure overhead.
# NOTE(review): if the inputs are read as tables that are already bucketed on
# the join key, this explicit repartition adds a shuffle that bucketing would
# otherwise avoid — consider dropping this cell in that case.
bucketed_df1 = bucketed_df1.repartition(10, "id").sortWithinPartitions("id")
bucketed_df2 = bucketed_df2.repartition(10, "id").sortWithinPartitions("id")

In [0]:
# Report how many partitions bucketed_df1's underlying RDD has.
bucketed_df1.rdd.getNumPartitions()

In [0]:
# Render bucketed_df2 in the notebook's table viewer.
bucketed_df2.display()

In [0]:
# join bucket tables

# Inner-join the two bucketed frames on "id" and keep only the columns that
# belong to bucketed_df1, then render the result.
bucket_df = bucketed_df1.join(bucketed_df2, ["id"], "inner").select(
    bucketed_df1["id"],
    bucketed_df1["hash"],
    bucketed_df1["sale"],
    bucketed_df1["exposure"],
)

bucket_df.display()

In [0]:
# Inner-join the in-memory df1 against bucketed_df2 on "id" and render it.
# display() returns None, so chaining it onto the assignment left `df` bound
# to None instead of the joined DataFrame; build the frame first, then
# display it as a separate statement.
df = df1.join(
    bucketed_df2,
    on=["id"],
    how="inner",
)
df.display()