**_Objetivo:_** Neste notebook, serão consolidados códigos para explorações práticas envolvendo o contéudo presente no capítulo 6 do livro Spark - The Definitive Guide: Working with Different Types of Data. No cenário proposto, exemplos de operações com diferentes tipos primitivos são mostrados, desde booleanos, até tipos complexos. Além disso, também serão mostrados exemplos de utilização de funções definidas pelo usuário (UDFs) para transformações adicionais.

In [1]:
# Importando bibliotecas
from pyspark.sql import SparkSession
import os

# Definindo variáveis de diretório
DATA_PATH = '../book-github-resources/Spark-The-Definitive-Guide-master/data/retail-data/by-day/2010-12-01.csv'

# Inicializando sessão
spark = SparkSession.builder.getOrCreate()
spark

In [2]:
# Lendo base de dados 
df = spark.read.format('csv')\
    .option('header', 'true')\
    .option('inferSchema', 'true')\
    .load(DATA_PATH)

# Criando view
df.createOrReplaceTempView('vw_retail_data')

# Verificando dados e schema
df.show(2)
df.printSchema()

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
only showing top 2 rows

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable

In [3]:
# Importando funções
from pyspark.sql.functions import col, expr

# Trabalhando com Booleanos

In [4]:
# Filtrando dados com referência de colunas
df.where(col("InvoiceNo") != 536365)\
    .select("InvoiceNo", "Description")\
    .show(2, truncate=False)

+---------+-------------------------+
|InvoiceNo|Description              |
+---------+-------------------------+
|536366   |HAND WARMER UNION JACK   |
|536366   |HAND WARMER RED POLKA DOT|
+---------+-------------------------+
only showing top 2 rows



In [5]:
# Filtrando dados direto com expressões
df.where("InvoiceNo <> 536365")\
    .select("InvoiceNo", "Description")\
    .show(2, truncate=False)

+---------+-------------------------+
|InvoiceNo|Description              |
+---------+-------------------------+
|536366   |HAND WARMER UNION JACK   |
|536366   |HAND WARMER RED POLKA DOT|
+---------+-------------------------+
only showing top 2 rows



In [6]:
expr("InvoiceNo <> 536365")

Column<'(NOT (InvoiceNo = 536365))'>

In [7]:
col("InvoiceNo") != 536365

Column<'(NOT (InvoiceNo = 536365))'>

In [8]:
# Importando função instr
from pyspark.sql.functions import instr

# Criação de filtros
stockFilter = col("StockCode").contains("DOT")
priceFilter = expr("UnitPrice") > 600
descFilter = instr("Description", "POSTAGE") >= 1

# Lista de seleção de colunas
selectList = ["InvoiceNo", "StockCode", "Description", "UnitPrice"]

# Aplicando filtros
df.where(stockFilter)\
    .where(priceFilter | descFilter)\
    .select(selectList)\
    .show()

+---------+---------+--------------+---------+
|InvoiceNo|StockCode|   Description|UnitPrice|
+---------+---------+--------------+---------+
|   536544|      DOT|DOTCOM POSTAGE|   569.77|
|   536592|      DOT|DOTCOM POSTAGE|   607.49|
+---------+---------+--------------+---------+



In [9]:
# Análogo em SQL
spark.sql("""
    SELECT InvoiceNo, StockCode, Description, UnitPrice
    FROM vw_retail_data
    WHERE StockCode IN ("DOT")
        AND (UnitPrice > 600 OR instr(Description, "POSTAGE") >= 1)
""").show()

+---------+---------+--------------+---------+
|InvoiceNo|StockCode|   Description|UnitPrice|
+---------+---------+--------------+---------+
|   536544|      DOT|DOTCOM POSTAGE|   569.77|
|   536592|      DOT|DOTCOM POSTAGE|   607.49|
+---------+---------+--------------+---------+



In [10]:
# Filtrando dados a partir de coluna booleana
df.withColumn('isExpensive', stockFilter & (priceFilter | descFilter))\
    .where('isExpensive')\
    .select(selectList + ['isExpensive'])\
    .show()

+---------+---------+--------------+---------+-----------+
|InvoiceNo|StockCode|   Description|UnitPrice|isExpensive|
+---------+---------+--------------+---------+-----------+
|   536544|      DOT|DOTCOM POSTAGE|   569.77|       true|
|   536592|      DOT|DOTCOM POSTAGE|   607.49|       true|
+---------+---------+--------------+---------+-----------+



# Trabalhando com Numéricos

In [11]:
# Importando funções
from pyspark.sql.functions import expr, pow, round

# Criando novo campo com base em fórmula (jeito 1)
fabricatedQuantity = round(pow(col("Quantity") * col("UnitPrice"), 2) + 5, 2)
df.select(
    "Quantity", "UnitPrice", 
    fabricatedQuantity.alias("realQuantity")
).show(3)

+--------+---------+------------+
|Quantity|UnitPrice|realQuantity|
+--------+---------+------------+
|       6|     2.55|      239.09|
|       6|     3.39|      418.72|
|       8|     2.75|       489.0|
+--------+---------+------------+
only showing top 3 rows



In [12]:
# Criando novo campo com base em fórmula (jeito 2)
df.selectExpr(
    "Quantity", "UnitPrice", 
    "round(power((Quantity * UnitPrice), 2) + 5, 2) AS realQuantity"
).show(3)

+--------+---------+------------+
|Quantity|UnitPrice|realQuantity|
+--------+---------+------------+
|       6|     2.55|      239.09|
|       6|     3.39|      418.72|
|       8|     2.75|       489.0|
+--------+---------+------------+
only showing top 3 rows



In [13]:
# Importando função
from pyspark.sql.functions import corr

# Correlação entre variáveis via select
df.select(corr("Quantity", "UnitPrice").alias("PriceQtyCorr")).show()

# Correlação entre variáveis via selectExpr
df.selectExpr("corr(Quantity, UnitPrice) AS PriceQtyCorr").show()

+--------------------+
|        PriceQtyCorr|
+--------------------+
|-0.04112314436835551|
+--------------------+

+--------------------+
|        PriceQtyCorr|
+--------------------+
|-0.04112314436835551|
+--------------------+



In [14]:
# Coletando colunas numéricas do DataFrame
num_cols = [s.name for s in df.schema if s.dataType.typeName() != 'string']

# Computando estatísticas de colunas numéricas
df.select(num_cols).describe().show()

+-------+------------------+------------------+------------------+
|summary|          Quantity|         UnitPrice|        CustomerID|
+-------+------------------+------------------+------------------+
|  count|              3108|              3108|              1968|
|   mean| 8.627413127413128| 4.151946589446603|15661.388719512195|
| stddev|26.371821677029203|15.638659854603892|1854.4496996893627|
|    min|               -24|               0.0|           12431.0|
|    max|               600|            607.49|           18229.0|
+-------+------------------+------------------+------------------+



In [15]:
# Importando funções estatísticas
from pyspark.sql.functions import count, mean, stddev_pop, min, max

# Calculando estatísticas na unha
function_list = [count, mean, stddev_pop, min, max]
stats_list = [func(col).alias(col + '_' + func.__name__) for func in function_list for col in num_cols]

# Realizando consulta (mostrando apenas as colunas de count e mean - espaço)
df.select(stats_list[:6]).show()

+--------------+---------------+----------------+-----------------+-----------------+------------------+
|Quantity_count|UnitPrice_count|CustomerID_count|    Quantity_mean|   UnitPrice_mean|   CustomerID_mean|
+--------------+---------------+----------------+-----------------+-----------------+------------------+
|          3108|           3108|            1968|8.627413127413128|4.151946589446603|15661.388719512195|
+--------------+---------------+----------------+-----------------+-----------------+------------------+



In [16]:
# Aplicando crosstab
df.stat.freqItems(["StockCode"]).take(1)

[Row(StockCode_freqItems=['90214E', '20728', '20755', '21703', '22113', '22524', '22041', '72803A', '72798C', '90181B', '21756', '22694', '90206C', '20970', '21624', '90209C', '84744', '82494L', '22952', '20682', '22583', '21705', '20679', '22220', '90177E', '90214A', '22448', '90214S', '22121', '22802', '84970L', '72818', '90192', '90200C', '22910', '21380', '90211A', '21137', '35271S', '84926A', '20765', '22384', '21524', '22165', '22366', '21221', '21704', '22519', '85035C', '21967', '22114', '22909', '22900', '22447', '21577', '21877', '20726', '85034A', 'DOT', '84658', '21472', '22804', '22222', '72802C', '21739', '22467', '90214H', '22785', '22446', '22197', '20665', '21733', '22731', '21709', '22086', '40001', '85123A'])]

# Trabalhando com Strings

In [17]:
# Importando função
from pyspark.sql.functions import initcap

# Aplicando title case
df.select(
    "Description", 
    initcap("Description").alias("TitledDescription")
).show(3, truncate=False)

+----------------------------------+----------------------------------+
|Description                       |TitledDescription                 |
+----------------------------------+----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER|White Hanging Heart T-light Holder|
|WHITE METAL LANTERN               |White Metal Lantern               |
|CREAM CUPID HEARTS COAT HANGER    |Cream Cupid Hearts Coat Hanger    |
+----------------------------------+----------------------------------+
only showing top 3 rows



In [18]:
# Importando funções
from pyspark.sql.functions import lower, upper

# Alterando case 
df.select(
    "Description",
    lower("Description").alias("LowerDescription"),
    upper(lower("Description")).alias("UpperDescription")
).show(3, truncate=False)

+----------------------------------+----------------------------------+----------------------------------+
|Description                       |LowerDescription                  |UpperDescription                  |
+----------------------------------+----------------------------------+----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER|white hanging heart t-light holder|WHITE HANGING HEART T-LIGHT HOLDER|
|WHITE METAL LANTERN               |white metal lantern               |WHITE METAL LANTERN               |
|CREAM CUPID HEARTS COAT HANGER    |cream cupid hearts coat hanger    |CREAM CUPID HEARTS COAT HANGER    |
+----------------------------------+----------------------------------+----------------------------------+
only showing top 3 rows



In [19]:
# Realizando via SQL
spark.sql("""
    SELECT
        Description AS original,
        initcap(Description) AS titled,
        lower(Description) AS lower,
        upper(lower(Description)) AS upper
    
    FROM vw_retail_data
""").show(3)

+--------------------+--------------------+--------------------+--------------------+
|            original|              titled|               lower|               upper|
+--------------------+--------------------+--------------------+--------------------+
|WHITE HANGING HEA...|White Hanging Hea...|white hanging hea...|WHITE HANGING HEA...|
| WHITE METAL LANTERN| White Metal Lantern| white metal lantern| WHITE METAL LANTERN|
|CREAM CUPID HEART...|Cream Cupid Heart...|cream cupid heart...|CREAM CUPID HEART...|
+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows



In [20]:
# Importando funções
from pyspark.sql.functions import lit, ltrim, rtrim, trim, lpad, rpad

# Exemplificando tratativas em string
df.select(
    ltrim(lit("    HELLO    ")).alias("ltrim"),
    rtrim(lit("    HELLO    ")).alias("rtrim"),
    trim(lit("    HELLO    ")).alias("trim"),
    lpad(lit("HELLO"), 2, " ").alias("lpad"),
    rpad(lit("HELLO"), 10, " ").alias("rpad")
).show(2)

+---------+---------+-----+----+----------+
|    ltrim|    rtrim| trim|lpad|      rpad|
+---------+---------+-----+----+----------+
|HELLO    |    HELLO|HELLO|  HE|HELLO     |
|HELLO    |    HELLO|HELLO|  HE|HELLO     |
+---------+---------+-----+----+----------+
only showing top 2 rows



In [21]:
# Importando funções
from pyspark.sql.functions import regexp_extract, regexp_replace

# Definindo RegEx
pattern = "(BLACK|WHITE|RED|GREEN|BLUE)"

# Construindo consulta
df.select(
    "Description",
    regexp_replace(col("Description"), pattern, "COLOR").alias("color_clean"),
    regexp_extract(col("Description"), pattern, 1).alias("first_color")
).show(3, truncate=False)

+----------------------------------+----------------------------------+-----------+
|Description                       |color_clean                       |first_color|
+----------------------------------+----------------------------------+-----------+
|WHITE HANGING HEART T-LIGHT HOLDER|COLOR HANGING HEART T-LIGHT HOLDER|WHITE      |
|WHITE METAL LANTERN               |COLOR METAL LANTERN               |WHITE      |
|CREAM CUPID HEARTS COAT HANGER    |CREAM CUPID HEARTS COAT HANGER    |           |
+----------------------------------+----------------------------------+-----------+
only showing top 3 rows



In [22]:
# Importando funções
from pyspark.sql.functions import instr

# Criando filtros
black_color_filter = col("Description").contains("BLACK")
white_color_filter = instr(col("Description"), "WHITE") >= 1

# Aplicando consulta
df.select("Description")\
    .where(black_color_filter & white_color_filter)\
    .show(3, truncate=False)

+---------------------------------+
|Description                      |
+---------------------------------+
|JUMBO  BAG BAROQUE BLACK WHITE   |
|WOOD BLACK BOARD ANT WHITE FINISH|
|JUMBO  BAG BAROQUE BLACK WHITE   |
+---------------------------------+
only showing top 3 rows



In [23]:
# Importando funções
from pyspark.sql.functions import locate

# Definindo lista de cores
find_list = ["black", "white", "red", "green", "blue"]
func_list = [locate(f.upper(), col("Description")).cast("boolean").alias(f"is_{f}") for f in find_list]

# Aplicando select
df.select("Description", *func_list).show(5, truncate=False)

+-----------------------------------+--------+--------+------+--------+-------+
|Description                        |is_black|is_white|is_red|is_green|is_blue|
+-----------------------------------+--------+--------+------+--------+-------+
|WHITE HANGING HEART T-LIGHT HOLDER |false   |true    |false |false   |false  |
|WHITE METAL LANTERN                |false   |true    |false |false   |false  |
|CREAM CUPID HEARTS COAT HANGER     |false   |false   |false |false   |false  |
|KNITTED UNION FLAG HOT WATER BOTTLE|false   |false   |false |false   |false  |
|RED WOOLLY HOTTIE WHITE HEART.     |false   |true    |true  |false   |false  |
+-----------------------------------+--------+--------+------+--------+-------+
only showing top 5 rows



# Trabalhando com Dates e Timestamps

In [24]:
# Importando funções
from pyspark.sql.functions import current_date, current_timestamp

# Gerando datas e horas
df_date = spark.range(1)\
    .withColumn("today", current_date())\
    .withColumn("now", current_timestamp())
df_date.show(truncate=False)

# Criando view
df_date.createOrReplaceTempView("tbl_date")

# Verificando schema
df_date.printSchema()

+---+----------+-----------------------+
|id |today     |now                    |
+---+----------+-----------------------+
|0  |2022-02-24|2022-02-24 18:37:20.516|
+---+----------+-----------------------+

root
 |-- id: long (nullable = false)
 |-- today: date (nullable = false)
 |-- now: timestamp (nullable = false)



In [25]:
# Importando funções
from pyspark.sql.functions import date_add, date_sub

# Adicionando datas
df_date.select(
    "today",
    date_add("today", 5).alias("today_plus_5"),
    date_sub("today", 5).alias("today_minus_5")
).show()

# Em SQL
spark.sql("""
    SELECT
        today,
        date_add(today, 5) AS today_plus_5,
        date_sub(today, 5) AS today_minus_5
    FROM tbl_date
""").show()

+----------+------------+-------------+
|     today|today_plus_5|today_minus_5|
+----------+------------+-------------+
|2022-02-24|  2022-03-01|   2022-02-19|
+----------+------------+-------------+

+----------+------------+-------------+
|     today|today_plus_5|today_minus_5|
+----------+------------+-------------+
|2022-02-24|  2022-03-01|   2022-02-19|
+----------+------------+-------------+



In [26]:
# Importando funções
from pyspark.sql.functions import datediff, months_between

# Aplicando consulta
df_date.withColumn("week_ago", date_sub("today", 7))\
    .select(
        "today",
        "week_ago",
        datediff("today", "week_ago").alias("datediff")
).show()

+----------+----------+--------+
|     today|  week_ago|datediff|
+----------+----------+--------+
|2022-02-24|2022-02-17|       7|
+----------+----------+--------+



In [27]:
# Importando funções
from pyspark.sql.functions import to_date

# Calculando diferença em meses entre duas datas
df_date.select(
    to_date(lit("2022-02-22")).alias("start"),
    to_date(lit("2022-12-05")).alias("end")
).select(
    months_between("start", "end").alias("months_between")
).show()

+--------------+
|months_between|
+--------------+
|    -9.4516129|
+--------------+



In [28]:
# Importando funções
from pyspark.sql.functions import to_date, to_timestamp

# Conversão de datas
spark.range(1)\
    .withColumn("date_default", to_date(lit("02-02-2022")))\
    .withColumn("date_formatted", to_date(lit("02-02-2022"), "dd-MM-yyyy"))\
    .withColumn("timestamp_formatted", to_timestamp(lit("02-02-2022"), "dd-MM-yyyy"))\
.show()

+---+------------+--------------+-------------------+
| id|date_default|date_formatted|timestamp_formatted|
+---+------------+--------------+-------------------+
|  0|        null|    2022-02-02|2022-02-02 00:00:00|
+---+------------+--------------+-------------------+



In [29]:
# Comparando datas
df_date.select(
    "today",
    lit("2022-02-23").alias("tomorrow"),
    col("today") > lit("2022-02-23")
).show()

+----------+----------+--------------------+
|     today|  tomorrow|(today > 2022-02-23)|
+----------+----------+--------------------+
|2022-02-24|2022-02-23|                true|
+----------+----------+--------------------+



# Trabalhando com Dados Nulos

In [30]:
# Importando funções
from pyspark.sql.functions import coalesce

# Gerando DataFrame
df_null = spark.createDataFrame(
    [(1, None), (None, 2)], ["col_1", "col_2"]
)

# Aplicando função
df_null.select(
    "col_1", "col_2", 
    coalesce("col_1", "col_2")
).show()

+-----+-----+----------------------+
|col_1|col_2|coalesce(col_1, col_2)|
+-----+-----+----------------------+
|    1| null|                     1|
| null|    2|                     2|
+-----+-----+----------------------+



In [31]:
# Criando view
df_null.createOrReplaceTempView("tbl_null")

# Testando diversas outras funções
spark.sql("""
    SELECT
        col_1,
        ifnull(col_1, "Retorno") AS ifnull,
        nullif("teste", "teste") AS nullif,
        nvl(col_1, "Retorno") AS nvl,
        nvl2(col_1, "Retorno", "Else") AS nvl2
    FROM tbl_null
""").show()

+-----+-------+------+-------+-------+
|col_1| ifnull|nullif|    nvl|   nvl2|
+-----+-------+------+-------+-------+
|    1|      1|  null|      1|Retorno|
| null|Retorno|  null|Retorno|   Else|
+-----+-------+------+-------+-------+



In [32]:
# Adicionando coluna totalmente nula
df_null = df_null.withColumn("col_3", lit(None))

# Utilizando drop
df_null.na.drop().show()
df_null.na.drop("all").show()
df_null.na.drop("all", subset=["col_2", "col_3"]).show()

+-----+-----+-----+
|col_1|col_2|col_3|
+-----+-----+-----+
+-----+-----+-----+

+-----+-----+-----+
|col_1|col_2|col_3|
+-----+-----+-----+
|    1| null| null|
| null|    2| null|
+-----+-----+-----+

+-----+-----+-----+
|col_1|col_2|col_3|
+-----+-----+-----+
| null|    2| null|
+-----+-----+-----+



In [33]:
# Preenchendo dados nulos
df_null = df_null.select("col_1", "col_2")
df_null.na.fill(0).show()

# Preenchendo dados nulos com subset
df_null.na.fill(0, subset=["col_1"]).show()

# Preenchendo dados nulos com dicionário
fill_dict = {"col_1": -1, "col_2": 10}
df_null.na.fill(fill_dict).show()

+-----+-----+
|col_1|col_2|
+-----+-----+
|    1|    0|
|    0|    2|
+-----+-----+

+-----+-----+
|col_1|col_2|
+-----+-----+
|    1| null|
|    0|    2|
+-----+-----+

+-----+-----+
|col_1|col_2|
+-----+-----+
|    1|   10|
|   -1|    2|
+-----+-----+



# Tipos Primitivos Complexos

## Struct

In [34]:
# Importando função
from pyspark.sql.functions import struct

# Criando DataFrame com tipo complexo
df_complex = df.select(
    "Description", 
    "InvoiceNo", 
    struct("Description", "InvoiceNo").alias("complex")
)
df_complex.show(5, truncate=False)

# Criando view
df_complex.createOrReplaceTempView("df_complex")

+-----------------------------------+---------+---------------------------------------------+
|Description                        |InvoiceNo|complex                                      |
+-----------------------------------+---------+---------------------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER |536365   |{WHITE HANGING HEART T-LIGHT HOLDER, 536365} |
|WHITE METAL LANTERN                |536365   |{WHITE METAL LANTERN, 536365}                |
|CREAM CUPID HEARTS COAT HANGER     |536365   |{CREAM CUPID HEARTS COAT HANGER, 536365}     |
|KNITTED UNION FLAG HOT WATER BOTTLE|536365   |{KNITTED UNION FLAG HOT WATER BOTTLE, 536365}|
|RED WOOLLY HOTTIE WHITE HEART.     |536365   |{RED WOOLLY HOTTIE WHITE HEART., 536365}     |
+-----------------------------------+---------+---------------------------------------------+
only showing top 5 rows



In [35]:
# Selecionando atributos de campo struct
df_complex.select("complex.Description").show(5)

# Selecionando todos os atributos de um struct
df_complex.select("complex.*").show(5)

# Selecionando atributo de um struct via getField
df_complex.select(col("complex").getField("InvoiceNo")).show(5)

+--------------------+
|         Description|
+--------------------+
|WHITE HANGING HEA...|
| WHITE METAL LANTERN|
|CREAM CUPID HEART...|
|KNITTED UNION FLA...|
|RED WOOLLY HOTTIE...|
+--------------------+
only showing top 5 rows

+--------------------+---------+
|         Description|InvoiceNo|
+--------------------+---------+
|WHITE HANGING HEA...|   536365|
| WHITE METAL LANTERN|   536365|
|CREAM CUPID HEART...|   536365|
|KNITTED UNION FLA...|   536365|
|RED WOOLLY HOTTIE...|   536365|
+--------------------+---------+
only showing top 5 rows

+-----------------+
|complex.InvoiceNo|
+-----------------+
|           536365|
|           536365|
|           536365|
|           536365|
|           536365|
+-----------------+
only showing top 5 rows



## Arrays

In [36]:
# Importando função
from pyspark.sql.functions import split

# Splitando texto
df.select("Description", split("Description", " ")).show(5, truncate=False)

+-----------------------------------+------------------------------------------+
|Description                        |split(Description,  , -1)                 |
+-----------------------------------+------------------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER |[WHITE, HANGING, HEART, T-LIGHT, HOLDER]  |
|WHITE METAL LANTERN                |[WHITE, METAL, LANTERN]                   |
|CREAM CUPID HEARTS COAT HANGER     |[CREAM, CUPID, HEARTS, COAT, HANGER]      |
|KNITTED UNION FLAG HOT WATER BOTTLE|[KNITTED, UNION, FLAG, HOT, WATER, BOTTLE]|
|RED WOOLLY HOTTIE WHITE HEART.     |[RED, WOOLLY, HOTTIE, WHITE, HEART.]      |
+-----------------------------------+------------------------------------------+
only showing top 5 rows



In [37]:
# Criando DataFrame para uso com arrays
df_array = df.select(split("Description", " ").alias("split_desc"))

# Selecionando elementos
df_array.selectExpr("split_desc[0] AS first_word").show(5)

+----------+
|first_word|
+----------+
|     WHITE|
|     WHITE|
|     CREAM|
|   KNITTED|
|       RED|
+----------+
only showing top 5 rows



In [38]:
# Importando função
from pyspark.sql.functions import size

# Elementos em um array
df_array.select("split_desc", size("split_desc")).show(5, False)

+------------------------------------------+----------------+
|split_desc                                |size(split_desc)|
+------------------------------------------+----------------+
|[WHITE, HANGING, HEART, T-LIGHT, HOLDER]  |5               |
|[WHITE, METAL, LANTERN]                   |3               |
|[CREAM, CUPID, HEARTS, COAT, HANGER]      |5               |
|[KNITTED, UNION, FLAG, HOT, WATER, BOTTLE]|6               |
|[RED, WOOLLY, HOTTIE, WHITE, HEART.]      |5               |
+------------------------------------------+----------------+
only showing top 5 rows



In [39]:
# Importando função
from pyspark.sql.functions import array_contains

# Verificando se o array contém determinado valor
df_array.select("split_desc", array_contains("split_desc", "WHITE")).show(5, False)

+------------------------------------------+---------------------------------+
|split_desc                                |array_contains(split_desc, WHITE)|
+------------------------------------------+---------------------------------+
|[WHITE, HANGING, HEART, T-LIGHT, HOLDER]  |true                             |
|[WHITE, METAL, LANTERN]                   |true                             |
|[CREAM, CUPID, HEARTS, COAT, HANGER]      |false                            |
|[KNITTED, UNION, FLAG, HOT, WATER, BOTTLE]|false                            |
|[RED, WOOLLY, HOTTIE, WHITE, HEART.]      |true                             |
+------------------------------------------+---------------------------------+
only showing top 5 rows



In [40]:
# Importando função
from pyspark.sql.functions import explode

# "Explodindo" array em múltiplas linhas
df_array.select(
    "split_desc",
    explode("split_desc").alias("exploded")
).show(13, truncate=False)

+----------------------------------------+--------+
|split_desc                              |exploded|
+----------------------------------------+--------+
|[WHITE, HANGING, HEART, T-LIGHT, HOLDER]|WHITE   |
|[WHITE, HANGING, HEART, T-LIGHT, HOLDER]|HANGING |
|[WHITE, HANGING, HEART, T-LIGHT, HOLDER]|HEART   |
|[WHITE, HANGING, HEART, T-LIGHT, HOLDER]|T-LIGHT |
|[WHITE, HANGING, HEART, T-LIGHT, HOLDER]|HOLDER  |
|[WHITE, METAL, LANTERN]                 |WHITE   |
|[WHITE, METAL, LANTERN]                 |METAL   |
|[WHITE, METAL, LANTERN]                 |LANTERN |
|[CREAM, CUPID, HEARTS, COAT, HANGER]    |CREAM   |
|[CREAM, CUPID, HEARTS, COAT, HANGER]    |CUPID   |
|[CREAM, CUPID, HEARTS, COAT, HANGER]    |HEARTS  |
|[CREAM, CUPID, HEARTS, COAT, HANGER]    |COAT    |
|[CREAM, CUPID, HEARTS, COAT, HANGER]    |HANGER  |
+----------------------------------------+--------+
only showing top 13 rows



In [41]:
# Criando view
df.select("Description").createOrReplaceTempView("vw_description")

# Explodindo array em SQL com LATERAL VIEW
spark.sql("""
    SELECT
        Description,
        split(Description, " ") AS splitted,
        exploded
    
    FROM vw_description
    
    LATERAL VIEW
        explode(split(Description, " ")) t AS exploded
""").show(8, truncate=False)

+----------------------------------+----------------------------------------+--------+
|Description                       |splitted                                |exploded|
+----------------------------------+----------------------------------------+--------+
|WHITE HANGING HEART T-LIGHT HOLDER|[WHITE, HANGING, HEART, T-LIGHT, HOLDER]|WHITE   |
|WHITE HANGING HEART T-LIGHT HOLDER|[WHITE, HANGING, HEART, T-LIGHT, HOLDER]|HANGING |
|WHITE HANGING HEART T-LIGHT HOLDER|[WHITE, HANGING, HEART, T-LIGHT, HOLDER]|HEART   |
|WHITE HANGING HEART T-LIGHT HOLDER|[WHITE, HANGING, HEART, T-LIGHT, HOLDER]|T-LIGHT |
|WHITE HANGING HEART T-LIGHT HOLDER|[WHITE, HANGING, HEART, T-LIGHT, HOLDER]|HOLDER  |
|WHITE METAL LANTERN               |[WHITE, METAL, LANTERN]                 |WHITE   |
|WHITE METAL LANTERN               |[WHITE, METAL, LANTERN]                 |METAL   |
|WHITE METAL LANTERN               |[WHITE, METAL, LANTERN]                 |LANTERN |
+----------------------------------+-------

## Map

In [42]:
# Importando função
from pyspark.sql.functions import create_map

# Criando um campo com map
df_map = df.select(
    "Description",
    "InvoiceNo",
    create_map("Description", "InvoiceNo").alias("complex_map")
)
df_map.show(5, truncate=False)

+-----------------------------------+---------+-----------------------------------------------+
|Description                        |InvoiceNo|complex_map                                    |
+-----------------------------------+---------+-----------------------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER |536365   |{WHITE HANGING HEART T-LIGHT HOLDER -> 536365} |
|WHITE METAL LANTERN                |536365   |{WHITE METAL LANTERN -> 536365}                |
|CREAM CUPID HEARTS COAT HANGER     |536365   |{CREAM CUPID HEARTS COAT HANGER -> 536365}     |
|KNITTED UNION FLAG HOT WATER BOTTLE|536365   |{KNITTED UNION FLAG HOT WATER BOTTLE -> 536365}|
|RED WOOLLY HOTTIE WHITE HEART.     |536365   |{RED WOOLLY HOTTIE WHITE HEART. -> 536365}     |
+-----------------------------------+---------+-----------------------------------------------+
only showing top 5 rows



In [43]:
# Selecionando valores em um campo map
df_map.selectExpr("complex_map['WHITE METAL LANTERN']").show(3)

+--------------------------------+
|complex_map[WHITE METAL LANTERN]|
+--------------------------------+
|                            null|
|                          536365|
|                            null|
+--------------------------------+
only showing top 3 rows



In [44]:
# 'Explodindo' valores em um campo map
df_map.select("Description", "InvoiceNo", explode("complex_map")).show(3, truncate=False)

+----------------------------------+---------+----------------------------------+------+
|Description                       |InvoiceNo|key                               |value |
+----------------------------------+---------+----------------------------------+------+
|WHITE HANGING HEART T-LIGHT HOLDER|536365   |WHITE HANGING HEART T-LIGHT HOLDER|536365|
|WHITE METAL LANTERN               |536365   |WHITE METAL LANTERN               |536365|
|CREAM CUPID HEARTS COAT HANGER    |536365   |CREAM CUPID HEARTS COAT HANGER    |536365|
+----------------------------------+---------+----------------------------------+------+
only showing top 3 rows



In [45]:
# Criando view
df.select("Description", "InvoiceNo").createOrReplaceTempView("vw_desc_invoice")

# Em SQL
spark.sql("""
    SELECT
        map(Description, InvoiceNo) AS complex_map,
        explode(map(Description, InvoiceNo)) AS (desc, invoice)
        
    FROM vw_desc_invoice
    
""").show(5, truncate=False)

+-----------------------------------------------+-----------------------------------+-------+
|complex_map                                    |desc                               |invoice|
+-----------------------------------------------+-----------------------------------+-------+
|{WHITE HANGING HEART T-LIGHT HOLDER -> 536365} |WHITE HANGING HEART T-LIGHT HOLDER |536365 |
|{WHITE METAL LANTERN -> 536365}                |WHITE METAL LANTERN                |536365 |
|{CREAM CUPID HEARTS COAT HANGER -> 536365}     |CREAM CUPID HEARTS COAT HANGER     |536365 |
|{KNITTED UNION FLAG HOT WATER BOTTLE -> 536365}|KNITTED UNION FLAG HOT WATER BOTTLE|536365 |
|{RED WOOLLY HOTTIE WHITE HEART. -> 536365}     |RED WOOLLY HOTTIE WHITE HEART.     |536365 |
+-----------------------------------------------+-----------------------------------+-------+
only showing top 5 rows



# Trabalhando com JSON

In [53]:
# Criando coluna JSON
df_json = spark.range(1)\
    .selectExpr("""
        '{"myJSONKey": {"myJSONValue": [1, 2, 3]}}' as jsonString
    """)
df_json.show(truncate=False)

+-----------------------------------------+
|jsonString                               |
+-----------------------------------------+
|{"myJSONKey": {"myJSONValue": [1, 2, 3]}}|
+-----------------------------------------+



In [62]:
# Importando funções
from pyspark.sql.functions import get_json_object, json_tuple

# Retornando valores JSON
df_json.select(
    get_json_object(col("jsonString"), "$.myJSONKey.myJSONValue[1]").alias("column"),
    json_tuple(col("jsonString"), "myJSONKey")
).show(truncate=False)

+------+-----------------------+
|column|c0                     |
+------+-----------------------+
|2     |{"myJSONValue":[1,2,3]}|
+------+-----------------------+



In [67]:
# Transformando StructType em JSON
from pyspark.sql.functions import to_json

df.selectExpr("(InvoiceNo, Description) as myStruct")\
    .select(to_json(col('myStruct')))\
    .show(5, truncate=False)

+--------------------------------------------------------------------------+
|to_json(myStruct)                                                         |
+--------------------------------------------------------------------------+
|{"InvoiceNo":"536365","Description":"WHITE HANGING HEART T-LIGHT HOLDER"} |
|{"InvoiceNo":"536365","Description":"WHITE METAL LANTERN"}                |
|{"InvoiceNo":"536365","Description":"CREAM CUPID HEARTS COAT HANGER"}     |
|{"InvoiceNo":"536365","Description":"KNITTED UNION FLAG HOT WATER BOTTLE"}|
|{"InvoiceNo":"536365","Description":"RED WOOLLY HOTTIE WHITE HEART."}     |
+--------------------------------------------------------------------------+
only showing top 5 rows



In [73]:
# Importando bibliotecas
from pyspark.sql.functions import from_json
from pyspark.sql.types import *

# Definindo schema
parseSchema = StructType((
    StructField("InvoiceNo", StringType(), True),
    StructField("Description", StringType(), True)
))

# Criando consulta para conversão de json em Struct
df.selectExpr("(InvoiceNo, Description) as myStruct")\
    .select(to_json(col("myStruct")).alias("newJSON"))\
    .select(col("newJSON"), from_json(col("newJSON"), parseSchema))\
    .show(2, truncate=True)

+--------------------+--------------------+
|             newJSON|  from_json(newJSON)|
+--------------------+--------------------+
|{"InvoiceNo":"536...|{536365, WHITE HA...|
|{"InvoiceNo":"536...|{536365, WHITE ME...|
+--------------------+--------------------+
only showing top 2 rows



In [75]:
# Definindo função
def power3(value):
    return value ** 3

power3(2.0)

8.0

In [79]:
# Importando função
from pyspark.sql.functions import udf

# Registrando udf
power3udf = udf(power3)

# Executando udf
df_udf = spark.range(5).toDF("num")
df_udf.select("num", power3udf(col("num"))).show()

+---+-----------+
|num|power3(num)|
+---+-----------+
|  0|          0|
|  1|          1|
|  2|          8|
|  3|         27|
|  4|         64|
+---+-----------+



In [85]:
# Registrando função para uso em SQL
spark.udf.register("power3", power3)

# Executando função como uma expressão
df_udf.selectExpr("power3(num)").show(3)

# Executando função com SparkSQL
df_udf.createOrReplaceTempView("df_udf")
spark.sql("""SELECT power3(num) FROM df_udf""").show(3)

+-----------+
|power3(num)|
+-----------+
|          0|
|          1|
|          8|
+-----------+
only showing top 3 rows

+-----------+
|power3(num)|
+-----------+
|          0|
|          1|
|          8|
+-----------+
only showing top 3 rows



In [87]:
# Registrando função com retorno explícito
spark.udf.register("power3_double", power3, DoubleType())

# Executando função em inteiros
df_udf.selectExpr("num", "power3_double(num)").show()

+---+------------------+
|num|power3_double(num)|
+---+------------------+
|  0|              null|
|  1|              null|
|  2|              null|
|  3|              null|
|  4|              null|
+---+------------------+



In [88]:
# Registrando função com casting explícito
spark.udf.register("power3_final", lambda x: float(x ** 3), DoubleType())

# Executando função em inteiros
df_udf.selectExpr("num", "power3_final(num)").show()

+---+-----------------+
|num|power3_final(num)|
+---+-----------------+
|  0|              0.0|
|  1|              1.0|
|  2|              8.0|
|  3|             27.0|
|  4|             64.0|
+---+-----------------+

