In [1]:
import os, sys
# Environment for the Spark session created later in this notebook:
# Hadoop/YARN configuration directories, the Python executable names
# handed to PySpark, and the Hadoop user name.
os.environ.update({
    'HADOOP_CONF_DIR': '/etc/hadoop/conf',
    'YARN_CONF_DIR': '/etc/hadoop/conf',
    'PYSPARK_PYTHON': 'python3.9',
    'PYSPARK_DRIVER_PYTHON': 'python3.9',
    'HADOOP_USER_NAME': 'ssenigov',
})

from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf

In [None]:
# Assemble the Spark configuration one setting at a time (each SparkConf
# setter mutates the object in place), then obtain a Hive-enabled session.
conf = SparkConf()
conf.setAppName('Bucketing_Part1')
conf.setMaster('yarn')
# Per the Spark SQL tuning docs, -1 disables automatic broadcast joins, so the
# joins further down are planned as sort-merge joins.
conf.set("spark.sql.autoBroadcastJoinThreshold", -1)
conf.set("spark.sql.warehouse.dir", 'hdfs:/users/ssenigov/spark_warehouse')

spark = (
    SparkSession.builder
    .config(conf=conf)
    .enableHiveSupport()
    .getOrCreate()
)
# Print the application id with the label left-aligned in a 40-character column.
print(f"{'app_id':<40} {spark.sparkContext.applicationId}")

In [3]:
from faker import Faker
from pyspark.sql import Row

# Generate synthetic people with Faker and turn them into two sample
# DataFrames: df1 with 10000 rows and df2 with 5000 rows.
fake = Faker()
Faker.seed(0)  # fixed seed so the generated names are reproducible


def _fake_people(n_rows):
    """Return a list of n_rows Rows, each holding (first name, last name) from `fake`."""
    rows = []
    for _ in range(n_rows):
        rows.append(Row(fake.first_name(), fake.last_name()))
    return rows


# Order matters: people1 is drawn first, people2 continues the same seeded stream.
people1 = _fake_people(10000)
people2 = _fake_people(5000)

# Distribute each list over 2 partitions and name the two columns.
df1 = spark.sparkContext.parallelize(people1, 2).toDF(["first_name", "last_name"])
df2 = spark.sparkContext.parallelize(people2, 2).toDF(["first_name", "last_name"])

                                                                                

In [4]:
# Persist both DataFrames as tables bucketed into 5 buckets on first_name,
# replacing any previous contents. Both tables use the same bucket count and
# bucketing column. An explicit `path` is passed for each table, so the data
# files are written under that HDFS location.
# (A stray trailing comma after the first call was removed: it turned the
# statement into a discarded 1-tuple expression.)
df1.write.bucketBy(5, 'first_name').mode("overwrite")\
  .saveAsTable('bucketed_people_1', path='hdfs:/users/ssenigov/spark_warehouse/bp1')

df2.write.bucketBy(5, 'first_name').mode("overwrite")\
  .saveAsTable('bucketed_people_2', path='hdfs:/users/ssenigov/spark_warehouse/bp2')

                                                                                

In [5]:
# Load the two bucketed tables back from the catalog as DataFrames.
df_bucketed_people_1, df_bucketed_people_2 = (
    spark.read.table(table_name)
    for table_name in ("bucketed_people_1", "bucketed_people_2")
)

In [6]:
# Inner equi-join of the two bucketed tables on first_name (the column both
# tables were bucketed by), followed by the physical plan. In the recorded
# plan below, both scans report "Bucketed: true" and no Exchange node appears.
df_res = df_bucketed_people_1.join(
    df_bucketed_people_2,
    on=['first_name'],
    how='inner',
)
df_res.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [first_name#20, last_name#21, last_name#25]
   +- SortMergeJoin [first_name#20], [first_name#24], Inner
      :- Sort [first_name#20 ASC NULLS FIRST], false, 0
      :  +- Filter isnotnull(first_name#20)
      :     +- FileScan parquet spark_catalog.default.bucketed_people_1[first_name#20,last_name#21] Batched: true, Bucketed: true, DataFilters: [isnotnull(first_name#20)], Format: Parquet, Location: InMemoryFileIndex(1 paths)[hdfs://adhcluster2.neoflex.ru:8020/users/ssenigov/spark_warehouse/bp1], PartitionFilters: [], PushedFilters: [IsNotNull(first_name)], ReadSchema: struct<first_name:string,last_name:string>, SelectedBucketsCount: 5 out of 5
      +- Sort [first_name#24 ASC NULLS FIRST], false, 0
         +- Filter isnotnull(first_name#24)
            +- FileScan parquet spark_catalog.default.bucketed_people_2[first_name#24,last_name#25] Batched: true, Bucketed: true, DataFilters: [isnotnull(first_name#24)], Format: 

In [9]:
# Display the first 20 rows of the current df_res, with long values truncated.
# NOTE(review): the saved output below has columns last_name|first_name|first_name,
# which matches the last_name join assigned in the In [8] cell, not the
# first_name join directly above. This cell was executed as In [9], after
# In [8]; a top-to-bottom run would show the first_name join instead.
df_res.show(n=20, truncate=True)

[Stage 6:>                                                          (0 + 1) / 2]

+---------+----------+----------+
|last_name|first_name|first_name|
+---------+----------+----------+
|   Abbott|     Bryan|  Lawrence|
|   Abbott|     Bryan|    Andrea|
|   Abbott|  Cristina|  Lawrence|
|   Abbott|  Cristina|    Andrea|
|   Abbott|      Brad|  Lawrence|
|   Abbott|      Brad|    Andrea|
|   Abbott|       Lee|  Lawrence|
|   Abbott|       Lee|    Andrea|
|   Abbott|    Glenda|  Lawrence|
|   Abbott|    Glenda|    Andrea|
|  Acevedo|    Brooke|  Jennifer|
|  Acevedo|    Brooke|     Terri|
|  Acevedo|    Brooke|   Raymond|
|  Acevedo|      Mary|  Jennifer|
|  Acevedo|      Mary|     Terri|
|  Acevedo|      Mary|   Raymond|
|  Acevedo|    Nicole|  Jennifer|
|  Acevedo|    Nicole|     Terri|
|  Acevedo|    Nicole|   Raymond|
|  Acevedo|     Brian|  Jennifer|
+---------+----------+----------+
only showing top 20 rows



                                                                                

In [8]:
# Same inner join, but on last_name, which is not the bucketing column.
# In the recorded plan below, the scan reports
# "Bucketed: false (disabled by query planner)" and an Exchange
# hashpartitioning(last_name, 200) is inserted before the sort.
df_res = df_bucketed_people_1.join(
    df_bucketed_people_2,
    on=['last_name'],
    how='inner',
)
df_res.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [last_name#21, first_name#20, first_name#24]
   +- SortMergeJoin [last_name#21], [last_name#25], Inner
      :- Sort [last_name#21 ASC NULLS FIRST], false, 0
      :  +- Exchange hashpartitioning(last_name#21, 200), ENSURE_REQUIREMENTS, [plan_id=171]
      :     +- Filter isnotnull(last_name#21)
      :        +- FileScan parquet spark_catalog.default.bucketed_people_1[first_name#20,last_name#21] Batched: true, Bucketed: false (disabled by query planner), DataFilters: [isnotnull(last_name#21)], Format: Parquet, Location: InMemoryFileIndex(1 paths)[hdfs://adhcluster2.neoflex.ru:8020/users/ssenigov/spark_warehouse/bp1], PartitionFilters: [], PushedFilters: [IsNotNull(last_name)], ReadSchema: struct<first_name:string,last_name:string>
      +- Sort [last_name#25 ASC NULLS FIRST], false, 0
         +- Exchange hashpartitioning(last_name#25, 200), ENSURE_REQUIREMENTS, [plan_id=175]
            +- Filter isnotnull(last_name#2