In [1]:
import os, sys
# Environment for the PySpark session created later in the notebook:
# Hadoop/YARN client configuration directory, the Python interpreter name
# used by PySpark, and the Hadoop user name the job runs under.
os.environ.update({
    'HADOOP_CONF_DIR': '/etc/hadoop/conf',
    'YARN_CONF_DIR': '/etc/hadoop/conf',
    'PYSPARK_PYTHON': 'python3.9',
    'PYSPARK_DRIVER_PYTHON': 'python3.9',
    'HADOOP_USER_NAME': 'ssenigov',
})

from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf

In [2]:
# Spark configuration for an application named 'Bucketing' submitted to YARN.
# The auto-broadcast threshold is set to -1 so that the joins below are not
# planned as broadcast joins (the plans printed later show SortMergeJoin).
conf = SparkConf()
conf.setAppName('Bucketing')
conf.setMaster('yarn')
conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

# Hive support is enabled; saveAsTable / read.table are used further down.
session_builder = SparkSession.builder.config(conf=conf).enableHiveSupport()
spark = session_builder.getOrCreate()

# Print the application id next to a fixed-width label.
app_id_label = "app_id".ljust(40)
print(app_id_label, spark.sparkContext.applicationId)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/03 15:09:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/01/03 15:09:59 WARN DomainSocketFactory: The short-circuit local reads feature cannot be used because libhadoop cannot be loaded.
25/01/03 15:09:59 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


app_id                                   application_1727681258360_0083


In [3]:
from faker import Faker
from collections import OrderedDict
from pyspark.sql import Row

# Build two sample DataFrames of fake (first_name, last_name) pairs.
fake = Faker()
Faker.seed(0)  # fixed seed for the Faker generators
num1 = 10000
num2 = 5000
name_columns = ["first_name", "last_name"]


def _fake_people(count):
    """Return a list of `count` Rows, each a (first name, last name) pair from Faker."""
    rows = []
    for _ in range(count):
        rows.append(Row(fake.first_name(), fake.last_name()))
    return rows


# Generate the first list completely before the second one.
people1 = _fake_people(num1)
people2 = _fake_people(num2)

# Distribute each list over 4 partitions and name the two columns.
spark_context = spark.sparkContext
df1 = spark_context.parallelize(people1, 4).toDF(name_columns)
df2 = spark_context.parallelize(people2, 4).toDF(name_columns)

                                                                                

In [4]:
# The idea of bucketing: save the data bucketed by the join key once, then
# reuse that layout when the tables are read back and joined.
# Both tables use the same bucket count (50) and the same bucketing column
# ('first_name'); mode("overwrite") replaces any previous copy of the table.
(
    df1.write.bucketBy(50, 'first_name').mode("overwrite")
    .saveAsTable('bucketed_people_1', path='hdfs:/users/ssenigov/spark_warehouse/bp1')
)

(
    df2.write.bucketBy(50, 'first_name').mode("overwrite")
    .saveAsTable('bucketed_people_2', path='hdfs:/users/ssenigov/spark_warehouse/bp2')
)

                                                                                

In [5]:
# Load the two bucketed tables back by table name.
df_bucketed_people_1 = spark.read.table("bucketed_people_1")
df_bucketed_people_2 = spark.read.table("bucketed_people_2")

# Row counts of both tables, printed on one line.
rows_in_table_1 = df_bucketed_people_1.count()
rows_in_table_2 = df_bucketed_people_2.count()
print(rows_in_table_1, rows_in_table_2)



10000 5000


                                                                                

In [6]:
# Inner join of the two bucketed tables on the bucketing column, followed by
# the physical plan so the join strategy and the bucketed scans can be inspected.
df_res = df_bucketed_people_1.join(
    other=df_bucketed_people_2,
    on=['first_name'],
    how='inner',
)
df_res.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [first_name#20, last_name#21, last_name#25]
   +- SortMergeJoin [first_name#20], [first_name#24], Inner
      :- Sort [first_name#20 ASC NULLS FIRST], false, 0
      :  +- Filter isnotnull(first_name#20)
      :     +- FileScan parquet spark_catalog.default.bucketed_people_1[first_name#20,last_name#21] Batched: true, Bucketed: true, DataFilters: [isnotnull(first_name#20)], Format: Parquet, Location: InMemoryFileIndex(1 paths)[hdfs://adhcluster2.neoflex.ru:8020/users/ssenigov/spark_warehouse/bp1], PartitionFilters: [], PushedFilters: [IsNotNull(first_name)], ReadSchema: struct<first_name:string,last_name:string>, SelectedBucketsCount: 50 out of 50
      +- Sort [first_name#24 ASC NULLS FIRST], false, 0
         +- Filter isnotnull(first_name#24)
            +- FileScan parquet spark_catalog.default.bucketed_people_2[first_name#24,last_name#25] Batched: true, Bucketed: true, DataFilters: [isnotnull(first_name#24)], Format

In [7]:
from pyspark.sql.functions import col
# Display the joined rows with each side's columns under distinct names.
# Each entry is (source DataFrame, source column, output alias).
column_aliases = (
    (df_bucketed_people_1, 'first_name', 'first_name_1'),
    (df_bucketed_people_1, 'last_name', 'last_name_1'),
    (df_bucketed_people_2, 'first_name', 'first_name_2'),
    (df_bucketed_people_2, 'last_name', 'last_name_2'),
)
aliased_columns = [
    source_df[source_name].alias(output_name)
    for source_df, source_name, output_name in column_aliases
]
df_res.select(*aliased_columns).show()

+------------+-----------+------------+-----------+
|first_name_1|last_name_1|first_name_2|last_name_2|
+------------+-----------+------------+-----------+
|     Felicia|    Roberts|     Felicia|       Neal|
|     Felicia|    Roberts|     Felicia|    Aguilar|
|     Felicia|    Roberts|     Felicia|       Wall|
|     Felicia|    Roberts|     Felicia|       Ward|
|     Felicia|    Roberts|     Felicia|   Anderson|
|     Felicia|    Roberts|     Felicia|      Wells|
|     Felicia|    Roberts|     Felicia|      Scott|
|     Felicia|       Dyer|     Felicia|       Neal|
|     Felicia|       Dyer|     Felicia|    Aguilar|
|     Felicia|       Dyer|     Felicia|       Wall|
|     Felicia|       Dyer|     Felicia|       Ward|
|     Felicia|       Dyer|     Felicia|   Anderson|
|     Felicia|       Dyer|     Felicia|      Wells|
|     Felicia|       Dyer|     Felicia|      Scott|
|     Felicia|    Santana|     Felicia|       Neal|
|     Felicia|    Santana|     Felicia|    Aguilar|
|     Felici

In [8]:
# Same join expressed in SQL, with first_name additionally restricted to two
# values. explain() prints the physical plan, where the number of buckets
# actually selected by the scan can be checked.
bucket_filter_query = """
 select bucketed_people_1.first_name, bucketed_people_1.last_name
  from bucketed_people_1 
   join bucketed_people_2 on bucketed_people_1.first_name = bucketed_people_2.first_name
                          and bucketed_people_1.first_name in ('John', 'Mary') """
df_filtered = spark.sql(bucket_filter_query)
df_filtered.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [first_name#20, last_name#21]
   +- SortMergeJoin [first_name#20], [first_name#24], Inner
      :- Sort [first_name#20 ASC NULLS FIRST], false, 0
      :  +- Filter (first_name#20 IN (John,Mary) AND isnotnull(first_name#20))
      :     +- FileScan parquet spark_catalog.default.bucketed_people_1[first_name#20,last_name#21] Batched: true, Bucketed: true, DataFilters: [first_name#20 IN (John,Mary), isnotnull(first_name#20)], Format: Parquet, Location: InMemoryFileIndex(1 paths)[hdfs://adhcluster2.neoflex.ru:8020/users/ssenigov/spark_warehouse/bp1], PartitionFilters: [], PushedFilters: [In(first_name, [John,Mary]), IsNotNull(first_name)], ReadSchema: struct<first_name:string,last_name:string>, SelectedBucketsCount: 2 out of 50
      +- Sort [first_name#24 ASC NULLS FIRST], false, 0
         +- Filter (first_name#24 IN (John,Mary) AND isnotnull(first_name#24))
            +- FileScan parquet spark_catalog.default.bucketed_p

In [9]:
# Number of joined rows per first name.
rows_per_first_name = df_filtered.groupBy('first_name').count()
rows_per_first_name.show()

+----------+-----+
|first_name|count|
+----------+-----+
|      John|10950|
|      Mary| 2205|
+----------+-----+



In [10]:
# spark.stop()  # left disabled; uncomment to stop the Spark session when finished