**_Objetivo:_** Neste notebook, serão consolidados códigos para explorações práticas envolvendo o conteúdo presente no capítulo 8 do livro Spark - The Definitive Guide: Joins. No cenário proposto, exemplos de união lateral de dados serão explorados de modo a propor transformações envolvendo dois ou mais datasets.

In [1]:
# Libraries: standard library first, then third-party
import os

from pyspark.sql import SparkSession

# Root directory that holds the book's companion resources
DATA_PATH = '../book-github-resources/'

# Obtain a Spark session (getOrCreate reuses an existing one when available)
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

# Bare last expression so the notebook renders the session object
spark

# Preparando Bases

In [2]:
# People: id, name, graduate-program key and the list of Spark status ids.
# The temp view is registered right away so the data is also reachable from SQL.
person = spark.createDataFrame(
    [
        (0, "Bill Chambers", 0, [100]),
        (1, "Matei Zaharia", 1, [500, 250, 100]),
        (2, "Michael Armbrust", 1, [250, 100]),
    ]
).toDF("id", "name", "grad_program", "spark_status")
person.createOrReplaceTempView("person")

# Graduate programs: id, degree, department and school.
gradProgram = spark.createDataFrame(
    [
        (0, "Masters", "School of Information", "UC Berkeley"),
        (2, "Masters", "EECS", "UC Berkeley"),
        (1, "Ph.D", "EECS", "UC Berkeley"),
    ]
).toDF("id", "degree", "department", "school")
gradProgram.createOrReplaceTempView("grad_program")

# Spark statuses: lookup table from status id to its label.
sparkStatus = spark.createDataFrame(
    [
        (500, "Vice President"),
        (250, "PMC Member"),
        (100, "Contributor"),
    ]
).toDF("id", "status")
sparkStatus.createOrReplaceTempView("spark_status")

In [3]:
# Print the three base DataFrames, in the order they were created
for frame in (person, gradProgram, sparkStatus):
    frame.show()

+---+----------------+------------+---------------+
| id|            name|grad_program|   spark_status|
+---+----------------+------------+---------------+
|  0|   Bill Chambers|           0|          [100]|
|  1|   Matei Zaharia|           1|[500, 250, 100]|
|  2|Michael Armbrust|           1|     [250, 100]|
+---+----------------+------------+---------------+

+---+-------+--------------------+-----------+
| id| degree|          department|     school|
+---+-------+--------------------+-----------+
|  0|Masters|School of Informa...|UC Berkeley|
|  2|Masters|                EECS|UC Berkeley|
|  1|   Ph.D|                EECS|UC Berkeley|
+---+-------+--------------------+-----------+

+---+--------------+
| id|        status|
+---+--------------+
|500|Vice President|
|250|    PMC Member|
|100|   Contributor|
+---+--------------+



In [4]:
# Join condition: a person's graduate-program key equals the program's id
join_expr = person["grad_program"] == gradProgram["id"]

# No join type is passed here; the result matches the explicit inner join
# shown in the next cell
(
    person
    .join(gradProgram, join_expr)
    .show()
)

+---+----------------+------------+---------------+---+-------+--------------------+-----------+
| id|            name|grad_program|   spark_status| id| degree|          department|     school|
+---+----------------+------------+---------------+---+-------+--------------------+-----------+
|  0|   Bill Chambers|           0|          [100]|  0|Masters|School of Informa...|UC Berkeley|
|  1|   Matei Zaharia|           1|[500, 250, 100]|  1|   Ph.D|                EECS|UC Berkeley|
|  2|Michael Armbrust|           1|     [250, 100]|  1|   Ph.D|                EECS|UC Berkeley|
+---+----------------+------------+---------------+---+-------+--------------------+-----------+



In [5]:
# Same join, with every argument (other, on, how) passed by keyword
person.join(
    other=gradProgram,
    on=join_expr,
    how="inner",
).show()

+---+----------------+------------+---------------+---+-------+--------------------+-----------+
| id|            name|grad_program|   spark_status| id| degree|          department|     school|
+---+----------------+------------+---------------+---+-------+--------------------+-----------+
|  0|   Bill Chambers|           0|          [100]|  0|Masters|School of Informa...|UC Berkeley|
|  1|   Matei Zaharia|           1|[500, 250, 100]|  1|   Ph.D|                EECS|UC Berkeley|
|  2|Michael Armbrust|           1|     [250, 100]|  1|   Ph.D|                EECS|UC Berkeley|
+---+----------------+------------+---------------+---+-------+--------------------+-----------+



In [6]:
# Equivalent inner join written in Spark SQL against the registered temp views
inner_join_sql = """
    SELECT
        *
    FROM person
    INNER JOIN grad_program
        ON person.grad_program = grad_program.id
"""
spark.sql(inner_join_sql).show()

+---+----------------+------------+---------------+---+-------+--------------------+-----------+
| id|            name|grad_program|   spark_status| id| degree|          department|     school|
+---+----------------+------------+---------------+---+-------+--------------------+-----------+
|  0|   Bill Chambers|           0|          [100]|  0|Masters|School of Informa...|UC Berkeley|
|  1|   Matei Zaharia|           1|[500, 250, 100]|  1|   Ph.D|                EECS|UC Berkeley|
|  2|Michael Armbrust|           1|     [250, 100]|  1|   Ph.D|                EECS|UC Berkeley|
+---+----------------+------------+---------------+---+-------+--------------------+-----------+



# Left Join

In [7]:
# Left outer join: every graduate program is kept; the person columns come
# back as null for a program with no matching person
gradProgram.join(
    other=person,
    on=join_expr,
    how="left_outer",
).show()

+---+-------+--------------------+-----------+----+----------------+------------+---------------+
| id| degree|          department|     school|  id|            name|grad_program|   spark_status|
+---+-------+--------------------+-----------+----+----------------+------------+---------------+
|  0|Masters|School of Informa...|UC Berkeley|   0|   Bill Chambers|           0|          [100]|
|  1|   Ph.D|                EECS|UC Berkeley|   1|   Matei Zaharia|           1|[500, 250, 100]|
|  1|   Ph.D|                EECS|UC Berkeley|   2|Michael Armbrust|           1|     [250, 100]|
|  2|Masters|                EECS|UC Berkeley|null|            null|        null|           null|
+---+-------+--------------------+-----------+----+----------------+------------+---------------+



# Right Join

In [8]:
# Right outer join: every graduate program (the right side) is kept; the
# person columns come back as null for a program with no matching person
person.join(
    other=gradProgram,
    on=join_expr,
    how="right_outer",
).show()

+----+----------------+------------+---------------+---+-------+--------------------+-----------+
|  id|            name|grad_program|   spark_status| id| degree|          department|     school|
+----+----------------+------------+---------------+---+-------+--------------------+-----------+
|   0|   Bill Chambers|           0|          [100]|  0|Masters|School of Informa...|UC Berkeley|
|   1|   Matei Zaharia|           1|[500, 250, 100]|  1|   Ph.D|                EECS|UC Berkeley|
|   2|Michael Armbrust|           1|     [250, 100]|  1|   Ph.D|                EECS|UC Berkeley|
|null|            null|        null|           null|  2|Masters|                EECS|UC Berkeley|
+----+----------------+------------+---------------+---+-------+--------------------+-----------+



# Left Semi e Left Anti

In [9]:
# Left semi join: only graduate-program columns are returned, restricted to
# programs that have at least one matching person
gradProgram.join(other=person, on=join_expr, how="left_semi").show()

# Left anti join: the complement - programs with no matching person
gradProgram.join(other=person, on=join_expr, how="left_anti").show()

+---+-------+--------------------+-----------+
| id| degree|          department|     school|
+---+-------+--------------------+-----------+
|  0|Masters|School of Informa...|UC Berkeley|
|  1|   Ph.D|                EECS|UC Berkeley|
+---+-------+--------------------+-----------+

+---+-------+----------+-----------+
| id| degree|department|     school|
+---+-------+----------+-----------+
|  2|Masters|      EECS|UC Berkeley|
+---+-------+----------+-----------+



# Join em Colunas Complexas

In [23]:
# split() turns a string column into an array of tokens
from pyspark.sql.functions import split

# Single-day retail CSV, located under the shared DATA_PATH root defined in
# the notebook's first cell (uses the `os` module imported there) instead of
# repeating the root directory as a literal
ALT_PATH = os.path.join(
    DATA_PATH,
    'Spark-The-Definitive-Guide-master/data/retail-data/by-day/2010-12-01.csv',
)

# Read the CSV with a header row, letting Spark infer the column types
df = (
    spark.read.format("csv")
    .option("inferSchema", "true")
    .option("header", "true")
    .load(ALT_PATH)
)

# One array column holding the words of each product description,
# used by the array examples in the following cells
df_array = df.select(split("Description", " ").alias("split_desc"))

In [34]:
# array_contains() tests whether an array column holds a given value
from pyspark.sql.functions import array_contains, col

# Show each tokenised description next to a flag telling whether "WHITE"
# is one of its tokens
df_array.select(
    col("split_desc"),
    array_contains(col("split_desc"), "WHITE").alias("flag_white"),
).show(5, truncate=False)

+------------------------------------------+----------+
|split_desc                                |flag_white|
+------------------------------------------+----------+
|[WHITE, HANGING, HEART, T-LIGHT, HOLDER]  |true      |
|[WHITE, METAL, LANTERN]                   |true      |
|[CREAM, CUPID, HEARTS, COAT, HANGER]      |false     |
|[KNITTED, UNION, FLAG, HOT, WATER, BOTTLE]|false     |
|[RED, WOOLLY, HOTTIE, WHITE, HEART.]      |true      |
+------------------------------------------+----------+
only showing top 5 rows



In [32]:
# Join on a complex column: a person matches a status row when the person's
# spark_status array contains that status id
person.join(
    sparkStatus,
    array_contains(person["spark_status"], sparkStatus["id"]),
).show()

+---+----------------+------------+---------------+---+--------------+
| id|            name|grad_program|   spark_status| id|        status|
+---+----------------+------------+---------------+---+--------------+
|  0|   Bill Chambers|           0|          [100]|100|   Contributor|
|  1|   Matei Zaharia|           1|[500, 250, 100]|500|Vice President|
|  1|   Matei Zaharia|           1|[500, 250, 100]|250|    PMC Member|
|  1|   Matei Zaharia|           1|[500, 250, 100]|100|   Contributor|
|  2|Michael Armbrust|           1|     [250, 100]|250|    PMC Member|
|  2|Michael Armbrust|           1|     [250, 100]|100|   Contributor|
+---+----------------+------------+---------------+---+--------------+



In [36]:
# expr() builds a Column from a SQL expression string; it is imported here
# because no earlier cell brings it into scope
from pyspark.sql.functions import expr

# Both DataFrames have an "id" column, so person's is renamed first; the bare
# "id" inside the SQL expression then refers only to sparkStatus's id
person.withColumnRenamed("id", "person_id")\
    .join(sparkStatus, on=expr("array_contains(spark_status, id)"))\
    .show()

+---------+----------------+------------+---------------+---+--------------+
|person_id|            name|grad_program|   spark_status| id|        status|
+---------+----------------+------------+---------------+---+--------------+
|        0|   Bill Chambers|           0|          [100]|100|   Contributor|
|        1|   Matei Zaharia|           1|[500, 250, 100]|500|Vice President|
|        1|   Matei Zaharia|           1|[500, 250, 100]|250|    PMC Member|
|        1|   Matei Zaharia|           1|[500, 250, 100]|100|   Contributor|
|        2|Michael Armbrust|           1|     [250, 100]|250|    PMC Member|
|        2|Michael Armbrust|           1|     [250, 100]|100|   Contributor|
+---------+----------------+------------+---------------+---+--------------+



In [38]:
# Same array_contains join, written in Spark SQL against the temp views
array_join_sql = """
    SELECT * FROM person AS p
    INNER JOIN spark_status AS s
        ON array_contains(p.spark_status, s.id)
"""
spark.sql(array_join_sql).show()

+---+----------------+------------+---------------+---+--------------+
| id|            name|grad_program|   spark_status| id|        status|
+---+----------------+------------+---------------+---+--------------+
|  0|   Bill Chambers|           0|          [100]|100|   Contributor|
|  1|   Matei Zaharia|           1|[500, 250, 100]|500|Vice President|
|  1|   Matei Zaharia|           1|[500, 250, 100]|250|    PMC Member|
|  1|   Matei Zaharia|           1|[500, 250, 100]|100|   Contributor|
|  2|Michael Armbrust|           1|     [250, 100]|250|    PMC Member|
|  2|Michael Armbrust|           1|     [250, 100]|100|   Contributor|
+---+----------------+------------+---------------+---+--------------+



In [40]:
# Print the physical plan Spark builds for the left semi join
gradProgram.join(
    other=person,
    on=join_expr,
    how="left_semi",
).explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [_1#16L AS id#24L, _2#17 AS degree#25, _3#18 AS department#26, _4#19 AS school#27]
   +- SortMergeJoin [_1#16L], [grad_program#10L], LeftSemi
      :- Sort [_1#16L ASC NULLS FIRST], false, 0
      :  +- Exchange hashpartitioning(_1#16L, 200), ENSURE_REQUIREMENTS, [id=#1501]
      :     +- Filter isnotnull(_1#16L)
      :        +- Scan ExistingRDD[_1#16L,_2#17,_3#18,_4#19]
      +- Sort [grad_program#10L ASC NULLS FIRST], false, 0
         +- Exchange hashpartitioning(grad_program#10L, 200), ENSURE_REQUIREMENTS, [id=#1502]
            +- Project [_3#2L AS grad_program#10L]
               +- Filter isnotnull(_3#2L)
                  +- Scan ExistingRDD[_1#0L,_2#1,_3#2L,_4#3]


