In [1]:
import os, sys
os.environ['HADOOP_CONF_DIR'] = '/etc/hadoop/conf'
os.environ['YARN_CONF_DIR'] = '/etc/hadoop/conf'
os.environ['PYSPARK_PYTHON'] =  'python3.9'
os.environ['PYSPARK_DRIVER_PYTHON'] = 'python3.9'
os.environ['HADOOP_USER_NAME']='ssenigov'

from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf

In [None]:
conf = (SparkConf().setAppName('Bucketing_Part2').setMaster('yarn') 
    .set("spark.sql.autoBroadcastJoinThreshold", -1)\
    .set("spark.sql.warehouse.dir", 'hdfs:/users/ssenigov/spark_warehouse')\
    .set("spark.sql.adaptive.enabled", False)  )
spark = SparkSession.builder.config(conf=conf).enableHiveSupport().getOrCreate()
print("app_id".ljust(40), spark.sparkContext.applicationId)

In [19]:
from pyspark.sql import Row
from faker import Faker

fake = Faker()
Faker.seed(0)

people = [ Row(fake.first_name(), fake.last_name()) for _ in range(10000)]

df = spark.sparkContext.parallelize(people, 2).toDF(["first_name", "last_name"])

In [22]:
# save to 500 buckets by first_name
df.write.bucketBy(500, 'first_name').mode("overwrite")\
  .saveAsTable('bucketed_people', path='hdfs:/users/ssenigov/spark_warehouse/bp') 

                                                                                

In [117]:
# query cases when we generated last name equal to somebody's first name 
df_filtered = spark.sql("""
 select distinct bp1.first_name, bp1.last_name, bp2.last_name
  from bucketed_people bp1
     join bucketed_people bp2 on bp1.first_name = bp2.last_name
   where bp1.first_name in ('Janny', 'Anna', 'John', 'Tom', 'James') """)
df_res = df_filtered
df_res.explain()

== Physical Plan ==
*(4) HashAggregate(keys=[first_name#780, last_name#781, last_name#783], functions=[])
+- *(4) HashAggregate(keys=[first_name#780, last_name#781, last_name#783], functions=[])
   +- *(4) SortMergeJoin [first_name#780], [last_name#783], Inner
      :- *(1) Sort [first_name#780 ASC NULLS FIRST], false, 0
      :  +- *(1) Filter (first_name#780 IN (Janny,Anna,John,Tom,James) AND isnotnull(first_name#780))
      :     +- *(1) ColumnarToRow
      :        +- FileScan parquet spark_catalog.default.bucketed_people[first_name#780,last_name#781] Batched: true, Bucketed: true, DataFilters: [first_name#780 IN (Janny,Anna,John,Tom,James), isnotnull(first_name#780)], Format: Parquet, Location: InMemoryFileIndex(1 paths)[hdfs://adhcluster2.neoflex.ru:8020/users/ssenigov/spark_warehouse/bp500], PartitionFilters: [], PushedFilters: [In(first_name, [Anna,James,Janny,John,Tom]), IsNotNull(first_name)], ReadSchema: struct<first_name:string,last_name:string>, SelectedBucketsCount: 5 out

In [38]:
from pyspark.sql.functions import col, hash, collect_set, count, lit, desc, size

df.withColumn('hash', hash(col('first_name')))\
       .withColumn('bucket_num', col('hash')%500)\
       .groupBy('bucket_num').agg(\
           collect_set('first_name').alias('first_names')\
            )\
       .withColumn('names_count', size(col('first_names'))) \
       .orderBy(desc('names_count')).show(truncate=False)



+----------+----------------------------------------------------+-----------+
|bucket_num|first_names                                         |names_count|
+----------+----------------------------------------------------+-----------+
|386       |[Cassie, Gabrielle, Russell, Curtis, Adrian, Dalton]|6          |
|-442      |[Helen, George, Latoya, Gavin]                      |4          |
|120       |[Javier, Craig, Matthew, Nathan]                    |4          |
|-169      |[Diamond, Dawn, Tracy, Krystal]                     |4          |
|-111      |[Shelby, Karen, Peggy]                              |3          |
|420       |[Anita, Dan, Katrina]                               |3          |
|-437      |[Mason, Noah, Kari]                                 |3          |
|-447      |[Makayla, Cassandra, Cheyenne]                      |3          |
|272       |[Barbara, Jane, Tina]                               |3          |
|109       |[Amanda, Charlene, Meagan]                          

                                                                                

In [46]:
# query cases when we generated last name equal to somebody's first name 
df_filtered = spark.sql("""
 select distinct bp1.first_name, bp1.last_name, bp2.last_name
  from bucketed_people bp1
     join bucketed_people bp2 on bp1.first_name = bp2.last_name
   where bp1.first_name in -- ('George', 'Latoya', 'Gavin', 'Helen')
       ('Cassie', 'Gabrielle', 'Russell', 'Curtis', 'Adrian', 'Dalton') """)
df_res = df_filtered
df_res.explain()

== Physical Plan ==
*(4) HashAggregate(keys=[first_name#592, last_name#593, last_name#638], functions=[])
+- *(4) HashAggregate(keys=[first_name#592, last_name#593, last_name#638], functions=[])
   +- *(4) SortMergeJoin [first_name#592], [last_name#638], Inner
      :- *(1) Sort [first_name#592 ASC NULLS FIRST], false, 0
      :  +- *(1) Filter (first_name#592 IN (Cassie,Gabrielle,Russell,Curtis,Adrian,Dalton) AND isnotnull(first_name#592))
      :     +- *(1) ColumnarToRow
      :        +- FileScan parquet spark_catalog.default.bucketed_people[first_name#592,last_name#593] Batched: true, Bucketed: true, DataFilters: [first_name#592 IN (Cassie,Gabrielle,Russell,Curtis,Adrian,Dalton), isnotnull(first_name#592)], Format: Parquet, Location: InMemoryFileIndex(1 paths)[hdfs://adhcluster2.neoflex.ru:8020/users/ssenigov/spark_warehouse/bp], PartitionFilters: [], PushedFilters: [In(first_name, [Adrian,Cassie,Curtis,Dalton,Gabrielle,Russell]), IsNotNull(first_name)], ReadSchema: struct<first_n

In [9]:
spark.stop()