**_Objetivo:_** Neste notebook, serão consolidados códigos para explorações práticas envolvendo o conteúdo presente no capítulo 7 do livro Spark - The Definitive Guide: Aggregations. No cenário proposto, exemplos de operações de agregação e sumarização de dados são fornecidos de modo a propor um entendimento claro sobre como tais transformações podem ser úteis no dia a dia de construção de fluxos de ETL e extração de insights de dados.

In [1]:
# Standard library first, then third-party packages
# NOTE(review): `os` is not referenced by any cell visible in this notebook
import os

from pyspark.sql import SparkSession

# Obtain a Spark session through the builder's getOrCreate()
spark = SparkSession.builder.getOrCreate()

# Bare last expression so the notebook renders the session object
spark

In [2]:
# Path to the retail CSV, relative to the notebook's working directory
DATA_PATH = '../book-github-resources/Spark-The-Definitive-Guide-master/data/retail-data/all/online-retail-dataset.csv'

# Read the CSV: the first row holds column names and column types are inferred
df = (
    spark.read
    .format('csv')
    .option('inferSchema', 'true')
    .option('header', 'true')
    .load(DATA_PATH)
)

# Mark the DataFrame for caching (it is reused by every cell below)
df.cache()

# Register it as the temp view `vw_retail` so the Spark SQL examples can query it
df.createOrReplaceTempView('vw_retail')

# Preview the first five rows
df.show(5)

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/2010 8:26|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
only showing top 5 rows



In [3]:
# Simple aggregation example: total number of rows in the DataFrame
df.count()

541909

# Funções de Agregação

## count

In [4]:
# Bring the aggregate function into scope
from pyspark.sql.functions import count

# count as an action: runs immediately and its result is printed directly
print(f'Ação: {df.count()}\n')

# count as a transformation: a column expression inside select(), shown via show()
(
    df
    .select(count("*"))
    .show()
)

# The same count expressed in Spark SQL against the `vw_retail` temp view
spark.sql(
    """
    SELECT count(1) FROM vw_retail
"""
).show()

Ação: 541909

+--------+
|count(1)|
+--------+
|  541909|
+--------+

+--------+
|count(1)|
+--------+
|  541909|
+--------+



## countDistinct

In [5]:
# Bring the aggregate function into scope
from pyspark.sql.functions import countDistinct

# Number of distinct StockCode values (DataFrame API)
(
    df
    .select(countDistinct("StockCode"))
    .show()
)

# Same aggregation through Spark SQL
spark.sql(
    """
    SELECT COUNT(DISTINCT StockCode) FROM vw_retail
"""
).show()

+-------------------------+
|count(DISTINCT StockCode)|
+-------------------------+
|                     4070|
+-------------------------+

+-------------------------+
|count(DISTINCT StockCode)|
+-------------------------+
|                     4070|
+-------------------------+



## approx_count_distinct

In [6]:
# Bring the aggregate function into scope
from pyspark.sql.functions import approx_count_distinct

# Approximate distinct count; the second argument (0.1) is the allowed error rate.
# With this setting the notebook output reports 3364 versus the exact 4070.
(
    df
    .select(approx_count_distinct("StockCode", 0.1))
    .show()
)

# Same aggregation through Spark SQL
spark.sql(
    """
    SELECT approx_count_distinct(StockCode, 0.1) FROM vw_retail
"""
).show()

+--------------------------------+
|approx_count_distinct(StockCode)|
+--------------------------------+
|                            3364|
+--------------------------------+

+--------------------------------+
|approx_count_distinct(StockCode)|
+--------------------------------+
|                            3364|
+--------------------------------+



## first e last

In [7]:
# Bring the aggregate functions into scope
from pyspark.sql.functions import first, last

# First and last StockCode values as seen by the aggregation (DataFrame API)
(
    df
    .select(
        first("StockCode").alias("first"),
        last("StockCode").alias("last"),
    )
    .show()
)

# Same aggregation through Spark SQL
spark.sql(
    """
    SELECT
        first(StockCode) AS first,
        last(StockCode) AS last
    FROM vw_retail
"""
).show()

+------+-----+
| first| last|
+------+-----+
|85123A|22138|
+------+-----+

+------+-----+
| first| last|
+------+-----+
|85123A|22138|
+------+-----+



## min e max

In [8]:
# Bring the aggregate functions into scope.
# NOTE: this rebinds the names `min` and `max` in the notebook namespace,
# so from here on they refer to the Spark column functions, not the Python builtins.
from pyspark.sql.functions import min, max

# Smallest and largest Quantity (DataFrame API)
(
    df
    .select(
        min("Quantity").alias("min"),
        max("Quantity").alias("max"),
    )
    .show()
)

# Same aggregation through Spark SQL
spark.sql(
    """
    SELECT
        min(Quantity) AS min,
        max(Quantity) AS max
    FROM vw_retail
"""
).show()

+------+-----+
|   min|  max|
+------+-----+
|-80995|80995|
+------+-----+

+------+-----+
|   min|  max|
+------+-----+
|-80995|80995|
+------+-----+



## sum

In [9]:
# Bring the aggregate function into scope.
# NOTE: this rebinds `sum` in the notebook namespace to the Spark column function;
# later cells call sum("Quantity") relying on it.
from pyspark.sql.functions import sum

# Total of the Quantity column (DataFrame API)
(
    df
    .select(sum("Quantity"))
    .show()
)

# Same aggregation through Spark SQL
spark.sql(
    """
    SELECT sum(Quantity) FROM vw_retail
"""
).show()

+-------------+
|sum(Quantity)|
+-------------+
|      5176450|
+-------------+

+-------------+
|sum(Quantity)|
+-------------+
|      5176450|
+-------------+



## sum_distinct

In [10]:
# Bring the aggregate function into scope
from pyspark.sql.functions import sum_distinct

# Sum over the distinct Quantity values only (DataFrame API)
(
    df
    .select(sum_distinct("Quantity"))
    .show()
)

# Same aggregation through Spark SQL
spark.sql(
    """
    SELECT sum(DISTINCT Quantity) FROM vw_retail
"""
).show()

+----------------------+
|sum(DISTINCT Quantity)|
+----------------------+
|                 29310|
+----------------------+

+----------------------+
|sum(DISTINCT Quantity)|
+----------------------+
|                 29310|
+----------------------+



## avg

In [11]:
# Bring the functions into scope (`sum` and `count` here are the Spark column functions)
from pyspark.sql.functions import (
    sum,
    count,
    avg,
    expr,
)

# Three ways of computing the mean of Quantity, shown side by side:
# a manual sum/count ratio, the avg() function, and a SQL `mean` expression
(
    df
    .select(
        (sum("Quantity") / count("Quantity")).alias("sum/count"),
        avg("Quantity").alias("avg"),
        expr("mean(Quantity)").alias("mean"),
    )
    .show()
)

+----------------+----------------+----------------+
|       sum/count|             avg|            mean|
+----------------+----------------+----------------+
|9.55224954743324|9.55224954743324|9.55224954743324|
+----------------+----------------+----------------+



## var e stddev

In [12]:
# Bring the dispersion functions into scope
from pyspark.sql.functions import (
    variance, stddev,
    var_pop, stddev_pop,
    var_samp, stddev_samp,
)

# Sample and population variants of variance / standard deviation
(
    df
    .select(
        var_samp("Quantity").alias("var_samp"),
        stddev_samp("Quantity").alias("stddev_samp"),
        var_pop("Quantity").alias("var_pop"),
        stddev_pop("Quantity").alias("stddev_pop"),
    )
    .show()
)

# Without choosing sample vs. population explicitly; in this notebook's output
# the values match the *_samp columns above
(
    df
    .select(
        variance("Quantity").alias("variance"),
        stddev("Quantity").alias("stddev"),
    )
    .show()
)

+-----------------+------------------+-----------------+------------------+
|         var_samp|       stddev_samp|          var_pop|        stddev_pop|
+-----------------+------------------+-----------------+------------------+
|47559.39140929892|218.08115785023455|47559.30364660923|218.08095663447835|
+-----------------+------------------+-----------------+------------------+

+-----------------+------------------+
|         variance|            stddev|
+-----------------+------------------+
|47559.39140929892|218.08115785023455|
+-----------------+------------------+



## skewness e kurtosis

In [13]:
# Bring the distribution-shape functions into scope
from pyspark.sql.functions import skewness, kurtosis

# Skewness and kurtosis of Quantity (DataFrame API)
(
    df
    .select(
        skewness("Quantity"),
        kurtosis("Quantity"),
    )
    .show()
)

# Same aggregation through Spark SQL
spark.sql(
    """
    SELECT 
        skewness(Quantity),
        kurtosis(Quantity)
    FROM vw_retail
"""
).show()

+--------------------+------------------+
|  skewness(Quantity)|kurtosis(Quantity)|
+--------------------+------------------+
|-0.26407557610528376|119768.05495530753|
+--------------------+------------------+

+--------------------+------------------+
|  skewness(Quantity)|kurtosis(Quantity)|
+--------------------+------------------+
|-0.26407557610528376|119768.05495530753|
+--------------------+------------------+



## corr e covar

In [14]:
# Bring the two-column statistics into scope
from pyspark.sql.functions import corr, covar_pop, covar_samp

# Correlation and population/sample covariance between Quantity and UnitPrice
(
    df
    .select(
        corr("Quantity", "UnitPrice").alias("corr"),
        covar_pop("Quantity", "UnitPrice").alias("covar_pop"),
        covar_samp("Quantity", "UnitPrice").alias("covar_samp"),
    )
    .show()
)

+--------------------+-------------------+-------------------+
|                corr|          covar_pop|         covar_samp|
+--------------------+-------------------+-------------------+
|-0.00123492454487...|-26.058713170968105|-26.058761257937057|
+--------------------+-------------------+-------------------+



## Agregando Tipos Complexos

In [15]:
# Bring the collection aggregates into scope
# NOTE(review): `size` is imported but not used in this cell
from pyspark.sql.functions import collect_set, collect_list, size

# Aggregate Country values into array columns: collect_set and collect_list
(
    df
    .select(
        collect_set("Country"),
        collect_list("Country"),
    )
    .show()
)

+--------------------+---------------------+
|collect_set(Country)|collect_list(Country)|
+--------------------+---------------------+
|[Portugal, Italy,...| [United Kingdom, ...|
+--------------------+---------------------+



# Agrupamentos

In [18]:
# Bring the aggregate functions into scope
# NOTE(review): `avg` is imported but not used in this cell
from pyspark.sql.functions import count, sum, avg

# Per (InvoiceNo, CustomerId) group: number of Quantity entries and their total
(
    df
    .groupBy("InvoiceNo", "CustomerId")
    .agg(
        count("Quantity").alias("count_qty"),
        sum("Quantity").alias("sum_qty"),
    )
    .show(5)
)

+---------+----------+---------+-------+
|InvoiceNo|CustomerId|count_qty|sum_qty|
+---------+----------+---------+-------+
|   536846|     14573|       76|    134|
|   537026|     12395|       12|    528|
|   537883|     14437|        5|     60|
|   538068|     17978|       12|    499|
|   538279|     14952|        7|    472|
+---------+----------+---------+-------+
only showing top 5 rows



In [20]:
# Grouped aggregation written as SQL expression strings.
# `expr` comes from the pyspark.sql.functions import in the avg cell above.
(
    df
    .groupBy("InvoiceNo", "CustomerId")
    .agg(
        expr("round(avg(Quantity), 2) AS avg_qty"),
        expr("round(stddev(Quantity), 2) AS stddev"),
    )
    .show(5)
)

+---------+----------+-------+------+
|InvoiceNo|CustomerId|avg_qty|stddev|
+---------+----------+-------+------+
|   536846|     14573|   1.76|  1.61|
|   537026|     12395|   44.0| 48.29|
|   537883|     14437|   12.0|   0.0|
|   538068|     17978|  41.58|123.03|
|   538279|     14952|  67.43|  76.8|
+---------+----------+-------+------+
only showing top 5 rows



In [22]:
# The same grouped metrics (count, sum, rounded avg and stddev) in a single Spark SQL query
spark.sql(
    """
    SELECT
        InvoiceNo,
        CustomerId,
        count(Quantity) AS count,
        sum(Quantity) AS sum_qty,
        round(avg(Quantity), 2) AS avg_qty,
        round(stddev(Quantity), 2) AS stddev
    
    FROM vw_retail
    
    GROUP BY
        InvoiceNo,
        CustomerId
"""
).show(5)

+---------+----------+-----+-------+-------+------+
|InvoiceNo|CustomerId|count|sum_qty|avg_qty|stddev|
+---------+----------+-----+-------+-------+------+
|   536846|     14573|   76|    134|   1.76|  1.61|
|   537026|     12395|   12|    528|   44.0| 48.29|
|   537883|     14437|    5|     60|   12.0|   0.0|
|   538068|     17978|   12|    499|  41.58|123.03|
|   538279|     14952|    7|    472|  67.43|  76.8|
+---------+----------+-----+-------+-------+------+
only showing top 5 rows



In [23]:
# Preview the source DataFrame again before deriving the date column below
df.show(5)

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/2010 8:26|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
only showing top 5 rows



In [60]:
# Bring the column helpers into scope
from pyspark.sql.functions import col, to_date

# Derive a `date` column by parsing the InvoiceDate string (e.g. "12/1/2010 8:26")
df_date = df.withColumn("date", to_date("InvoiceDate", "M/d/yyyy H:mm"))

# Expose the enriched DataFrame to Spark SQL as `vw_retail_date`
df_date.createOrReplaceTempView("vw_retail_date")

# Compare the raw string with the parsed date
(
    df_date
    .select("InvoiceDate", "date")
    .show(5)
)

+--------------+----------+
|   InvoiceDate|      date|
+--------------+----------+
|12/1/2010 8:26|2010-12-01|
|12/1/2010 8:26|2010-12-01|
|12/1/2010 8:26|2010-12-01|
|12/1/2010 8:26|2010-12-01|
|12/1/2010 8:26|2010-12-01|
+--------------+----------+
only showing top 5 rows



In [70]:
# Bring the window API and ranking helpers into scope
# (`max` is the Spark column function, not the Python builtin)
from pyspark.sql.window import Window
from pyspark.sql.functions import desc, max, rank, dense_rank

# Window definition: one partition per (CustomerId, date), rows ordered by
# descending Quantity, frame running from the partition start to the current row
windowSpec = (
    Window
    .partitionBy("CustomerId", "date")
    .orderBy(desc("Quantity"))
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

In [71]:
# Ranking expressions evaluated over the window defined above
purchaseRank = rank().over(windowSpec)
purchaseDenseRank = dense_rank().over(windowSpec)

# Maximum of Quantity evaluated over the same window
maxPurchaseQty = max(col("Quantity")).over(windowSpec)

In [65]:
# Keep rows with a known customer, order by customer, and display each purchase
# next to its window-derived rank, dense rank and maximum quantity
(
    df_date
    .where("CustomerId IS NOT NULL")
    .orderBy("CustomerId")
    .select(
        "CustomerId",
        "date",
        "Quantity",
        purchaseRank.alias("qtyRank"),
        purchaseDenseRank.alias("qtyDenseRank"),
        maxPurchaseQty.alias("maxPurchaseQty"),
    )
    .show(20)
)

+----------+----------+--------+-------+------------+--------------+
|CustomerId|      date|Quantity|qtyRank|qtyDenseRank|maxPurchaseQty|
+----------+----------+--------+-------+------------+--------------+
|     12346|2011-01-18|   74215|      1|           1|         74215|
|     12346|2011-01-18|  -74215|      2|           2|         74215|
|     12347|2010-12-07|      36|      1|           1|            36|
|     12347|2010-12-07|      30|      2|           2|            36|
|     12347|2010-12-07|      24|      3|           3|            36|
|     12347|2010-12-07|      12|      4|           4|            36|
|     12347|2010-12-07|      12|      4|           4|            36|
|     12347|2010-12-07|      12|      4|           4|            36|
|     12347|2010-12-07|      12|      4|           4|            36|
|     12347|2010-12-07|      12|      4|           4|            36|
|     12347|2010-12-07|      12|      4|           4|            36|
|     12347|2010-12-07|      12|  

In [69]:
# The same window analysis in Spark SQL: each OVER clause repeats the
# partitioning, ordering and frame used by the DataFrame version
spark.sql(
    """
    SELECT
        CustomerId,
        date,
        Quantity,
        
        rank(Quantity) OVER (PARTITION BY CustomerId, date
                             ORDER BY Quantity DESC NULLS LAST
                             ROWS BETWEEN
                                 UNBOUNDED PRECEDING AND
                                 CURRENT ROW) AS rank,
                                 
        dense_rank(Quantity) OVER (PARTITION BY CustomerId, date
                                   ORDER BY Quantity DESC NULLS LAST
                                    ROWS BETWEEN
                                        UNBOUNDED PRECEDING AND
                                        CURRENT ROW) AS dRank,
        
        max(Quantity) OVER (PARTITION BY CustomerId, date
                            ORDER BY Quantity DESC NULLS LAST
                            ROWS BETWEEN
                                UNBOUNDED PRECEDING AND
                                CURRENT ROW) AS maxPurchase
                                
    FROM vw_retail_date WHERE CustomerId IS NOT NULL ORDER BY CustomerId

"""
).show(20)

+----------+----------+--------+----+-----+-----------+
|CustomerId|      date|Quantity|rank|dRank|maxPurchase|
+----------+----------+--------+----+-----+-----------+
|     12346|2011-01-18|   74215|   1|    1|      74215|
|     12346|2011-01-18|  -74215|   2|    2|      74215|
|     12347|2010-12-07|      36|   1|    1|         36|
|     12347|2010-12-07|      30|   2|    2|         36|
|     12347|2010-12-07|      12|   4|    4|         36|
|     12347|2010-12-07|      12|   4|    4|         36|
|     12347|2010-12-07|      24|   3|    3|         36|
|     12347|2010-12-07|      12|   4|    4|         36|
|     12347|2010-12-07|      12|   4|    4|         36|
|     12347|2010-12-07|      12|   4|    4|         36|
|     12347|2010-12-07|      12|   4|    4|         36|
|     12347|2010-12-07|      12|   4|    4|         36|
|     12347|2010-12-07|      12|   4|    4|         36|
|     12347|2010-12-07|      12|   4|    4|         36|
|     12347|2010-12-07|      12|   4|    4|     

# Grouping Sets

In [88]:
# Drop rows containing nulls and publish the result as the `vw_retail_not_null` view
df_not_null = df_date.na.drop()
df_not_null.createOrReplaceTempView("vw_retail_not_null")

# Grouping set with a single level: (CustomerId, StockCode)
spark.sql(
    """
    SELECT
        CustomerId,
        StockCode,
        sum(Quantity)
        
    FROM vw_retail_not_null
    
    GROUP BY CustomerId, StockCode
    
    GROUPING SETS((CustomerId, StockCode))
    
    ORDER BY CustomerId DESC, StockCode DESC
"""
).show(5)

+----------+---------+-------------+
|CustomerId|StockCode|sum(Quantity)|
+----------+---------+-------------+
|     18287|    85173|           48|
|     18287|   85040A|           48|
|     18287|   85039B|          120|
|     18287|   85039A|           96|
|     18287|    84920|            4|
+----------+---------+-------------+
only showing top 5 rows



In [105]:
# Grouping sets at three levels in one query: per (CustomerId, StockCode),
# per CustomerId, and the grand total (the empty set). Rows from the coarser
# levels show null in the columns they do not group by.
spark.sql(
    """
    SELECT
        CustomerId,
        StockCode,
        sum(Quantity)
        
    FROM vw_retail_not_null
    
    GROUP BY CustomerId, StockCode
    
    GROUPING SETS((CustomerId, StockCode), (CustomerId), ())
    
    ORDER BY sum(Quantity) DESC, CustomerId DESC, StockCode DESC
"""
).show(5)

+----------+---------+-------------+
|CustomerId|StockCode|sum(Quantity)|
+----------+---------+-------------+
|      null|     null|      4906888|
|     14646|     null|       196719|
|     12415|     null|        77242|
|     14911|     null|        77180|
|     17450|     null|        69029|
+----------+---------+-------------+
only showing top 5 rows



## Rollups

In [107]:
# Rollup over (date, Country) on the null-free data, renaming the aggregate column.
# `sum` is the Spark column function imported in earlier cells.
df_rollup = (
    df_not_null
    .rollup("date", "Country")
    .agg(sum("Quantity"))
    .selectExpr("Date", "Country", "`sum(Quantity)` AS total_quantity")
    .orderBy("Date")
)

# Preview the rollup
df_rollup.show(5)

+----------+--------------+--------------+
|      Date|       Country|total_quantity|
+----------+--------------+--------------+
|      null|          null|       4906888|
|2010-12-01|United Kingdom|         21167|
|2010-12-01|          null|         24032|
|2010-12-01|     Australia|           107|
|2010-12-01|        France|           449|
+----------+--------------+--------------+
only showing top 5 rows



In [111]:
# Rollup over (date, Country) on the full dataset (rows with nulls included)
(
    df_date
    .rollup("date", "Country")
    .agg(sum("Quantity"))
    .orderBy("Date")
    .show(5)
)

+----------+--------------+-------------+
|      date|       Country|sum(Quantity)|
+----------+--------------+-------------+
|      null|          null|      5176450|
|2010-12-01|United Kingdom|        23949|
|2010-12-01|          null|        26814|
|2010-12-01|     Australia|          107|
|2010-12-01|        France|          449|
+----------+--------------+-------------+
only showing top 5 rows



In [113]:
# Overall Quantity total, for comparison with the (null, null) row of the rollup above
df_date.select(sum("Quantity")).show()

+-------------+
|sum(Quantity)|
+-------------+
|      5176450|
+-------------+



## Cube

In [115]:
# Multidimensional aggregation: cube over (Date, Country), ordered by Date
(
    df_date
    .cube("Date", "Country")
    .agg(sum("Quantity"))
    .orderBy("Date")
    .show(5)
)

+----+--------------------+-------------+
|Date|             Country|sum(Quantity)|
+----+--------------------+-------------+
|null|United Arab Emirates|          982|
|null|               Italy|         7999|
|null|           Singapore|         5234|
|null|             Finland|        10666|
|null|              Greece|         1556|
+----+--------------------+-------------+
only showing top 5 rows



In [118]:
# Same cube, ordered by Country instead of Date
(
    df_date
    .cube("Date", "Country")
    .agg(sum("Quantity"))
    .orderBy("Country")
    .show(5)
)

+----------+-------+-------------+
|      Date|Country|sum(Quantity)|
+----------+-------+-------------+
|2011-01-11|   null|        29093|
|2011-01-13|   null|        10114|
|2010-12-19|   null|         3795|
|2011-01-06|   null|        22461|
|2011-02-10|   null|        11447|
+----------+-------+-------------+
only showing top 5 rows



In [121]:
# Build the cube once and slice it by which grouping columns are null
df_cube = (
    df_date
    .cube("Date", "Country")
    .agg(sum("Quantity"))
)

# Totals per country (date column is null)
df_cube.where("date is null").show(5)

# Totals per date (country column is null)
df_cube.where("country is null").show(5)

# Grand total, regardless of date and country
df_cube.where("date is null AND country is null").show()

# Totals per (date, country) pair
df_cube.where("date is not null AND country is not null").show(5)

+----+--------------------+-------------+
|Date|             Country|sum(Quantity)|
+----+--------------------+-------------+
|null|             Finland|        10666|
|null|           Lithuania|          652|
|null|              Poland|         3653|
|null|             Iceland|         2458|
|null|United Arab Emirates|          982|
+----+--------------------+-------------+
only showing top 5 rows

+----------+-------+-------------+
|      Date|Country|sum(Quantity)|
+----------+-------+-------------+
|2010-12-17|   null|        16069|
|2010-12-14|   null|        20098|
|2011-02-13|   null|         2715|
|2011-01-06|   null|        22461|
|2011-02-17|   null|        14544|
+----------+-------+-------------+
only showing top 5 rows

+----+-------+-------------+
|Date|Country|sum(Quantity)|
+----+-------+-------------+
|null|   null|      5176450|
+----+-------+-------------+

+----------+--------------+-------------+
|      Date|       Country|sum(Quantity)|
+----------+--------------+

## Pivot

In [123]:
from pyspark.sql.functions import month

In [125]:
# Add a month column derived from `date`, then pivot so each month becomes a
# column holding the summed Quantity per Country
(
    df_date
    .withColumn("month_dt", month("date"))
    .groupBy("Country")
    .pivot("month_dt")
    .agg(sum("Quantity"))
    .show(5)
)

+---------+----+----+----+----+-----+----+----+----+-----+-----+-----+-----+
|  Country|   1|   2|   3|   4|    5|   6|   7|   8|    9|   10|   11|   12|
+---------+----+----+----+----+-----+----+----+----+-----+-----+-----+-----+
|   Sweden|3096| 250|5263| 310| 2829| 404|6006|1308| 4344| 6151| 1962| 3714|
|Singapore|1091|null|null|1384| null|null|2160|null| null|  599| null| null|
|  Germany|8906|4083|7675|5692|12951|7348|8991|9560|11028|17636|12922|10656|
|      RSA|null|null|null|null| null|null|null|null| null|  352| null| null|
|   France|9155|5301|8639|2216| 9780|9441|5656|7948|12912|13737|17086| 8609|
+---------+----+----+----+----+-----+----+----+----+-----+-----+-----+-----+
only showing top 5 rows

