In [2]:
# Importando bibliotecas
from pyspark.sql import SparkSession
import os

# Construindo objeto de sessão
spark = SparkSession\
    .builder\
    .appName("transformacoes-pyspark")\
    .getOrCreate()

# Definindo variáveis de diretório
home_path = os.path.expanduser("~")
data_path = os.path.join(home_path, "dev/panini-tech-lab/data/brazilian-ecommerce")

# Realizando leitura dos dados: orders
df_orders = spark.read.format("csv")\
    .option("inferSchema", "true")\
    .option("header", "true")\
    .load(os.path.join(data_path, "orders/"))

# Realizando leitura dos dados: order items
df_order_items = spark.read.format("csv")\
    .option("inferSchema", "true")\
    .option("header", "true")\
    .load(os.path.join(data_path, "order_items/"))

# Realizando leitura dos dados: order payments
df_order_payments = spark.read.format("csv")\
    .option("inferSchema", "true")\
    .option("header", "true")\
    .load(os.path.join(data_path, "order_payments/"))

# Realizando leitura dos dados: order reviews
df_order_reviews = spark.read.format("csv")\
    .option("inferSchema", "true")\
    .option("header", "true")\
    .load(os.path.join(data_path, "order_reviews/"))

# Realizando leitura dos dados: products
df_products = spark.read.format("csv")\
    .option("inferSchema", "true")\
    .option("header", "true")\
    .load(os.path.join(data_path, "products/"))

# Realizando leitura dos dados: customers
df_customers = spark.read.format("csv")\
    .option("inferSchema", "true")\
    .option("header", "true")\
    .load(os.path.join(data_path, "customers/"))

# Realizando leitura dos dados: geolocation
df_geolocation = spark.read.format("csv")\
    .option("inferSchema", "true")\
    .option("header", "true")\
    .load(os.path.join(data_path, "geolocation/"))

# Realizando leitura dos dados: sellers
df_sellers = spark.read.format("csv")\
    .option("inferSchema", "true")\
    .option("header", "true")\
    .load(os.path.join(data_path, "sellers/"))

22/09/19 14:27:43 WARN Utils: Your hostname, panini-ubuntu resolves to a loopback address: 127.0.1.1; using 10.0.0.110 instead (on interface enp3s0)
22/09/19 14:27:43 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/19 14:27:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


                                                                                

<div align="center">
    <br><img src="https://i.imgur.com/HRhd2Y0.png" width=750 height=450 alt="brazilian e-commerce star schema">
</div>

In [3]:
# Importando tipos primitivos
from pyspark.sql.types import StructType, StructField, \
    StringType, IntegerType

# Definindo schema
customers_schema = StructType([
    StructField("customer_id", StringType(), nullable=True, metadata={"description": "Id do cliente"}),
    StructField("customer_unique_id", StringType(), nullable=True, metadata={"description": "Id único do cliente"}),
    StructField("customer_zip_code_prefix", IntegerType(), nullable=True, metadata={"description": "Prefixo do CEP do cliente"}),
    StructField("customer_city", StringType(), nullable=True, metadata={"description": "Cidade do cliente"}),
    StructField("customer_state", StringType(), nullable=True, metadata={"description": "Estado do cliente"})
])

# Realizando a leitura dos dados
df_customers = spark.read.format("csv")\
    .option("header", "true")\
    .schema(customers_schema)\
    .load(os.path.join(data_path, "customers/"))

# Verificando amostra dos dados
df_customers.printSchema()
df_customers.show(5, truncate=False)

root
 |-- customer_id: string (nullable = true)
 |-- customer_unique_id: string (nullable = true)
 |-- customer_zip_code_prefix: integer (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)



                                                                                

+--------------------------------+--------------------------------+------------------------+---------------------+--------------+
|customer_id                     |customer_unique_id              |customer_zip_code_prefix|customer_city        |customer_state|
+--------------------------------+--------------------------------+------------------------+---------------------+--------------+
|06b8999e2fba1a1fbc88172c00ba8bc7|861eff4711a542e4b93843c6dd7febb0|14409                   |franca               |SP            |
|18955e83d337fd6b2def6b18a428ac77|290c77bc529b7ac935b93aa66c333dc3|9790                    |sao bernardo do campo|SP            |
|4e7b3e00288586ebd08712fdd0374a03|060e732b5b29e8181a18229c7b0b2b5e|1151                    |sao paulo            |SP            |
|b2b6027bc5c5109e529d4dc6358b12c3|259dac757896d24d7702b9acbbff3f3c|8775                    |mogi das cruzes      |SP            |
|4f2d8ab171c80ec8364f7c12e35b23ad|345ecd01c38d18a9036ed96c73b8d066|13056                  

## Criando Consultas

### col()

In [4]:
# Importando função
from pyspark.sql.functions import col

# Referenciando colunas e criando expressões
col("anomes") * 100 + 1
col("valor_A") > col("valor_B")

# Aplicando funções à referências de colunas
from pyspark.sql.functions import lpad, lower, split

lpad(col("moeda"), 2, "")
lower(col("sigla"))
split("nome_completo", " ")

Column<'split(nome_completo,  , -1)'>

In [5]:
# Importando função
from pyspark.sql.functions import expr

# Criando expressões em formato de string
expr("(anomes * 100) + 1")
expr("valor_A > valor_B")
expr("lpad(moeda, 2, '')")
expr("lower(sigla)")
expr("split(nome_completo, '')")

Column<'split(nome_completo, )'>

### select()

In [6]:
# Importando funções
from pyspark.sql.functions import upper, datediff

# Verificando detalhes sobre o pedido
df_orders_prep = df_orders.select(
    expr("order_id AS id_pedido"),
    expr("to_date(order_purchase_timestamp) AS dt_compra"),
    expr("to_date(order_approved_at) AS dt_aprovacao"),
    expr("to_date(order_delivered_customer_date) AS dt_entrega"),
    upper(col("order_status")).alias("status_pedido"),
    datediff(col("order_delivered_customer_date"), col("order_purchase_timestamp")).alias("dias_para_entrega"),
)

# Visualizando dados
df_orders_prep.show(5)

+--------------------+----------+------------+----------+-------------+-----------------+
|           id_pedido| dt_compra|dt_aprovacao|dt_entrega|status_pedido|dias_para_entrega|
+--------------------+----------+------------+----------+-------------+-----------------+
|e481f51cbdc54678b...|2017-10-02|  2017-10-02|2017-10-10|    DELIVERED|                8|
|53cdb2fc8bc7dce0b...|2018-07-24|  2018-07-26|2018-08-07|    DELIVERED|               14|
|47770eb9100c2d0c4...|2018-08-08|  2018-08-08|2018-08-17|    DELIVERED|                9|
|949d5b44dbf5de918...|2017-11-18|  2017-11-18|2017-12-02|    DELIVERED|               14|
|ad21c59c0840e6cb8...|2018-02-13|  2018-02-13|2018-02-16|    DELIVERED|                3|
+--------------------+----------+------------+----------+-------------+-----------------+
only showing top 5 rows



### selectExpr()

In [7]:
# Verificando detalhes sobre o pedido
df_orders_prep = df_orders.selectExpr(
    "order_id AS id_pedido",
    "to_date(order_purchase_timestamp) AS dt_compra",
    "to_date(order_approved_at) AS dt_aprovacao",
    "to_date(order_delivered_customer_date) AS dt_entrega",
    "upper(order_status) AS status_pedido",
    "datediff(order_delivered_customer_date, order_purchase_timestamp) AS dias_para_entrega",
    "case when upper(order_status) = 'DELIVERED' then 1 else 0 end as flag_entregue"
)

# Visualizando dados
df_orders_prep.show(5)

+--------------------+----------+------------+----------+-------------+-----------------+-------------+
|           id_pedido| dt_compra|dt_aprovacao|dt_entrega|status_pedido|dias_para_entrega|flag_entregue|
+--------------------+----------+------------+----------+-------------+-----------------+-------------+
|e481f51cbdc54678b...|2017-10-02|  2017-10-02|2017-10-10|    DELIVERED|                8|            1|
|53cdb2fc8bc7dce0b...|2018-07-24|  2018-07-26|2018-08-07|    DELIVERED|               14|            1|
|47770eb9100c2d0c4...|2018-08-08|  2018-08-08|2018-08-17|    DELIVERED|                9|            1|
|949d5b44dbf5de918...|2017-11-18|  2017-11-18|2017-12-02|    DELIVERED|               14|            1|
|ad21c59c0840e6cb8...|2018-02-13|  2018-02-13|2018-02-16|    DELIVERED|                3|            1|
+--------------------+----------+------------+----------+-------------+-----------------+-------------+
only showing top 5 rows



## Operações em colunas

### withColumn()

In [8]:
from pyspark.sql.functions import concat

In [9]:
from pyspark.sql.functions import split

# Adicionando colunas em um DataFrame
df_payments_prep = df_order_payments\
    .withColumn("moeda", expr("'R$'"))\
    .withColumn("vlr_pgto_moeda", expr("concat(moeda, cast(payment_value AS string))"))\
    .withColumn("tipo_pagamento", split(col("payment_type"), "_")[0])

# Visualizando dados
df_payments_prep.show(5)

+--------------------+------------------+------------+--------------------+-------------+-----+--------------+--------------+
|            order_id|payment_sequential|payment_type|payment_installments|payment_value|moeda|vlr_pgto_moeda|tipo_pagamento|
+--------------------+------------------+------------+--------------------+-------------+-----+--------------+--------------+
|b81ef226f3fe1789b...|                 1| credit_card|                   8|        99.33|   R$|       R$99.33|        credit|
|a9810da82917af2d9...|                 1| credit_card|                   1|        24.39|   R$|       R$24.39|        credit|
|25e8ea4e93396b6fa...|                 1| credit_card|                   1|        65.71|   R$|       R$65.71|        credit|
|ba78997921bbcdc13...|                 1| credit_card|                   8|       107.78|   R$|      R$107.78|        credit|
|42fdf880ba16b47b5...|                 1| credit_card|                   2|       128.45|   R$|      R$128.45|        

### withColumnRenamed()

In [10]:
# Tratando base de pagamentos através de métodos característicos
df_payments_prep = df_order_payments\
    .withColumnRenamed("order_id", "id_pedido")\
    .withColumnRenamed("payment_sequential", "parcela_pgto")\
    .withColumnRenamed("payment_type", "tipo_pgto")\
    .withColumnRenamed("payment_installments", "qtd_parcelas")\
    .withColumnRenamed("payment_value", "vlr_pgto")\
    .withColumn("moeda", expr("'R$'"))

# Visualizando resultado
df_payments_prep.show(5)

+--------------------+------------+-----------+------------+--------+-----+
|           id_pedido|parcela_pgto|  tipo_pgto|qtd_parcelas|vlr_pgto|moeda|
+--------------------+------------+-----------+------------+--------+-----+
|b81ef226f3fe1789b...|           1|credit_card|           8|   99.33|   R$|
|a9810da82917af2d9...|           1|credit_card|           1|   24.39|   R$|
|25e8ea4e93396b6fa...|           1|credit_card|           1|   65.71|   R$|
|ba78997921bbcdc13...|           1|credit_card|           8|  107.78|   R$|
|42fdf880ba16b47b5...|           1|credit_card|           2|  128.45|   R$|
+--------------------+------------+-----------+------------+--------+-----+
only showing top 5 rows



### alias()

In [11]:
# Criando consulta e adicionando alias 
df_sellers_prep = df_sellers.select(
    col("seller_id").alias("id_vendedor"),
    col("seller_city").alias("cidade_vendedor"),
    col("seller_state").alias("estado_vendedor")
)

# Visualizando dados
df_sellers_prep.show(5, truncate=False)

+--------------------------------+-----------------+---------------+
|id_vendedor                     |cidade_vendedor  |estado_vendedor|
+--------------------------------+-----------------+---------------+
|3442f8959a84dea7ee197c632cb2df15|campinas         |SP             |
|d1b65fc7debc3361ea86b5f14c68d2e2|mogi guacu       |SP             |
|ce3ad9de960102d0677a81f5d0bb7b2d|rio de janeiro   |RJ             |
|c0f3eea2e14555b6faeea3dd58c1b1c3|sao paulo        |SP             |
|51a04a8a6bdcb23deccc82b0b80742cf|braganca paulista|SP             |
+--------------------------------+-----------------+---------------+
only showing top 5 rows



### drop()

In [12]:
# DataFrame original de geolocalização
df_geolocation.show(5)

# Eliminando colunas de cidade
df_geo_dropped = df_geolocation.drop("geolocation_state")
df_geo_dropped.show(5)

+---------------------------+-------------------+------------------+----------------+-----------------+
|geolocation_zip_code_prefix|    geolocation_lat|   geolocation_lng|geolocation_city|geolocation_state|
+---------------------------+-------------------+------------------+----------------+-----------------+
|                       1037| -23.54562128115268|-46.63929204800168|       sao paulo|               SP|
|                       1046|-23.546081127035535|-46.64482029837157|       sao paulo|               SP|
|                       1046| -23.54612896641469|-46.64295148361138|       sao paulo|               SP|
|                       1041|  -23.5443921648681|-46.63949930627844|       sao paulo|               SP|
|                       1035|-23.541577961711493|-46.64160722329613|       sao paulo|               SP|
+---------------------------+-------------------+------------------+----------------+-----------------+
only showing top 5 rows

+---------------------------+----------

In [13]:
# Eliminando múltiplas colunas
df_geolocation.drop("geolocation_city", "geolocation_state").show(5)

+---------------------------+-------------------+------------------+
|geolocation_zip_code_prefix|    geolocation_lat|   geolocation_lng|
+---------------------------+-------------------+------------------+
|                       1037| -23.54562128115268|-46.63929204800168|
|                       1046|-23.546081127035535|-46.64482029837157|
|                       1046| -23.54612896641469|-46.64295148361138|
|                       1041|  -23.5443921648681|-46.63949930627844|
|                       1035|-23.541577961711493|-46.64160722329613|
+---------------------------+-------------------+------------------+
only showing top 5 rows



### lit()

In [14]:
# Importando função
from pyspark.sql.functions import lit, when

# Analisando pagamentos online
df_pgtos = df_order_payments.select(
    col("order_id").alias("id_pedido"),
    lit(1).alias("flag_pgto_realizado"),
    col("payment_type").alias("tipo_pgto"),
    col("payment_installments").alias("qtd_parcelas"),
    when(col("payment_installments") >= 3, lit(1)).otherwise(lit(0)).alias("flag_pgto_maior_3x"),
    col("payment_value").alias("vlr_pgto")
)

# Visualizando dados
df_pgtos.show(5)

+--------------------+-------------------+-----------+------------+------------------+--------+
|           id_pedido|flag_pgto_realizado|  tipo_pgto|qtd_parcelas|flag_pgto_maior_3x|vlr_pgto|
+--------------------+-------------------+-----------+------------+------------------+--------+
|b81ef226f3fe1789b...|                  1|credit_card|           8|                 1|   99.33|
|a9810da82917af2d9...|                  1|credit_card|           1|                 0|   24.39|
|25e8ea4e93396b6fa...|                  1|credit_card|           1|                 0|   65.71|
|ba78997921bbcdc13...|                  1|credit_card|           8|                 1|  107.78|
|42fdf880ba16b47b5...|                  1|credit_card|           2|                 0|  128.45|
+--------------------+-------------------+-----------+------------+------------------+--------+
only showing top 5 rows



### cast()

In [15]:
# Importando função e tipos primitivos
from pyspark.sql.functions import cast, year, month, dayofmonth
from pyspark.sql.types import DateType, IntegerType

# Referenciando expressões de transformação
ano_compra = year(col("order_purchase_timestamp").cast(DateType()))
mes_compra = month(col("order_purchase_timestamp").cast(DateType()))
dia_compra = dayofmonth(col("order_purchase_timestamp").cast(DateType()))

# Transformando datas de base de pedidos
df_orders_prep = df_orders.select(
    col("order_id").alias("id_pedido"),
    col("order_purchase_timestamp").cast(DateType()).alias("dt_compra"),
    ano_compra.alias("ano_compra"),
    mes_compra.alias("mes_compra"),
    dia_compra.alias("dia_compra"),
    ((ano_compra * 100) + mes_compra).cast(IntegerType()).alias("anomes_compra"),
    expr("cast(date_format(order_purchase_timestamp, 'yyyymmdd') AS INT) AS anomesdia_compra")
)

# Visualizando dados
df_orders_prep.show(5)

+--------------------+----------+----------+----------+----------+-------------+----------------+
|           id_pedido| dt_compra|ano_compra|mes_compra|dia_compra|anomes_compra|anomesdia_compra|
+--------------------+----------+----------+----------+----------+-------------+----------------+
|e481f51cbdc54678b...|2017-10-02|      2017|        10|         2|       201710|        20175602|
|53cdb2fc8bc7dce0b...|2018-07-24|      2018|         7|        24|       201807|        20184124|
|47770eb9100c2d0c4...|2018-08-08|      2018|         8|         8|       201808|        20183808|
|949d5b44dbf5de918...|2017-11-18|      2017|        11|        18|       201711|        20172818|
|ad21c59c0840e6cb8...|2018-02-13|      2018|         2|        13|       201802|        20181813|
+--------------------+----------+----------+----------+----------+-------------+----------------+
only showing top 5 rows



___

## Operações em Registros

### where()

In [16]:
# Filtrando clientes de uma cidade em específico
city_filter = col("customer_city") == "campinas"

# Criando consulta
df_customers_sbc = df_customers.selectExpr(
    "customer_id AS id_cliente",
    "customer_city AS cidade_cliente",
    "customer_state AS estado_cliente"
).where(city_filter)
#.where(expr("customer_city = 'campinas'"))

# Visualizando resultado
df_customers_sbc.show(5, truncate=False)

+--------------------------------+--------------+--------------+
|id_cliente                      |cidade_cliente|estado_cliente|
+--------------------------------+--------------+--------------+
|4f2d8ab171c80ec8364f7c12e35b23ad|campinas      |SP            |
|4ad4f929392158a3bb76b3ec02a751b2|campinas      |SP            |
|ccb8e120e8af0bbf5a1daa2f21984d7b|campinas      |SP            |
|fb04e5eb553d32e040a1a83cf436a65c|campinas      |SP            |
|5cb2307ae35b44c9dd90f76c649b99b5|campinas      |SP            |
+--------------------------------+--------------+--------------+
only showing top 5 rows



### sort() e orderBy()

In [17]:
# Retornando itens mais caros
df_expensive_items = df_order_items.selectExpr(
    "product_id AS id_produto",
    "price AS vlr_produto",
    "freight_value AS vlr_frete"
).sort(["price", "freight_value"], ascending=[False, True])

# Visualizando dados
df_expensive_items.show(5)



+--------------------+-----------+---------+
|          id_produto|vlr_produto|vlr_frete|
+--------------------+-----------+---------+
|489ae2aa008f02150...|     6735.0|   194.31|
|69c590f7ffc7bf8db...|     6729.0|   193.21|
|1bdf5e6731585cf01...|     6499.0|   227.66|
|a6492cc69376c469a...|     4799.0|   151.34|
|c3ed642d592594bb6...|     4690.0|    74.34|
+--------------------+-----------+---------+
only showing top 5 rows



                                                                                

In [18]:
# Retornando itens mais baratos
df_cheap_items = df_order_items.selectExpr(
    "product_id AS id_produto",
    "price AS vlr_produto",
    "freight_value AS vlr_frete"
).orderBy("vlr_produto")

# Visualizando dados
df_cheap_items.show(5)

+--------------------+-----------+---------+
|          id_produto|vlr_produto|vlr_frete|
+--------------------+-----------+---------+
|8a3254bee785a526d...|       0.85|    18.23|
|8a3254bee785a526d...|       0.85|    18.23|
|8a3254bee785a526d...|       0.85|     22.3|
|05b515fdc76e888aa...|        1.2|     7.89|
|05b515fdc76e888aa...|        1.2|     7.89|
+--------------------+-----------+---------+
only showing top 5 rows



                                                                                

### groupBy() e agg()

In [19]:
# Importando funções
from pyspark.sql.functions import count, avg, max, min, round

# Retornando os produtos mais caros vendidos
df_items_prep = df_order_items.groupBy("product_id").agg(
    count("*").alias("qtd_vendas"),
    round(avg("price"), 2).alias("avg_price"),
    expr("max(price) AS max_price"),
    min("price").alias("min_price"),
    expr("round(max(price) - min(price), 2) AS dif_max_min_price")
).orderBy("qtd_vendas", ascending=False)

# Visualizando dados
df_items_prep.show(5, truncate=False)

                                                                                

+--------------------------------+----------+---------+---------+---------+-----------------+
|product_id                      |qtd_vendas|avg_price|max_price|min_price|dif_max_min_price|
+--------------------------------+----------+---------+---------+---------+-----------------+
|aca2eb7d00ea1a7b8ebd4e68314663af|527       |71.36    |109.9    |69.9     |40.0             |
|99a4788cb24856965c36a24e339b6058|488       |88.17    |89.9     |74.0     |15.9             |
|422879e10f46682990de24d770e7f83d|484       |54.91    |59.9     |49.0     |10.9             |
|389d119b48cf3043d311335e499d9c6b|392       |54.7     |59.9     |49.0     |10.9             |
|368c6c730842d78016ad823897a372db|388       |54.27    |59.9     |49.0     |10.9             |
+--------------------------------+----------+---------+---------+---------+-----------------+
only showing top 5 rows



In [20]:
# Média de pagamentos por tipo
df_payment_types = df_order_payments.groupBy("payment_type").avg("payment_value")

# Visualizando dados
df_payment_types.show()



+------------+------------------+
|payment_type|avg(payment_value)|
+------------+------------------+
|      boleto|145.03443540234633|
| not_defined|               0.0|
| credit_card|163.31902063935996|
|     voucher| 65.70335411255414|
|  debit_card|142.57017004578165|
+------------+------------------+



                                                                                

In [21]:
# Média de pagamentos por tipo
df_payment_types = df_order_payments\
    .groupBy("payment_type").agg(
        expr("count(1) AS qtd_pgtos"),
        expr("round(avg(payment_value), 2) AS avg_payment_value"),
        expr("round(sum(payment_value), 2) AS sum_payment_value")
    ).sort("qtd_pgtos", ascending=False)

# Visualizando dados
df_payment_types.show()



+------------+---------+-----------------+-----------------+
|payment_type|qtd_pgtos|avg_payment_value|sum_payment_value|
+------------+---------+-----------------+-----------------+
| credit_card|    76795|           163.32|    1.254208419E7|
|      boleto|    19784|           145.03|       2869361.27|
|     voucher|     5775|             65.7|        379436.87|
|  debit_card|     1529|           142.57|        217989.79|
| not_defined|        3|              0.0|              0.0|
+------------+---------+-----------------+-----------------+



                                                                                

___

## Juntando Dados

### union()

In [22]:
# Gerando DataFrames específicos para posterior união
df_perfurmaria = df_products.where(expr("product_category_name = 'perfumaria'"))
df_artes = df_products.where(expr("product_category_name = 'artes'"))

# Contabilizando registros
print(f'Registros de produtos na categoria PERFURMARIA: {df_perfurmaria.count()}')
print(f'Registros de produtos na categoria ARTES: {df_artes.count()}')

# Unindo DataFrames
df_perfurmaria_artes = df_perfurmaria.union(df_artes)
print(f'\nRegistros no DataFrame após união: {df_perfurmaria_artes.count()}')

Registros de produtos na categoria PERFURMARIA: 868
Registros de produtos na categoria ARTES: 55





Registros no DataFrame após união: 923


                                                                                

In [23]:
# Unindo DataFrames diferentes
df_orders.union(df_order_payments)

AnalysisException: Union can only be performed on tables with the same number of columns, but the first table has 8 columns and the second table has 5 columns;
'Union false, false
:- Relation [order_id#17,customer_id#18,order_status#19,order_purchase_timestamp#20,order_approved_at#21,order_delivered_carrier_date#22,order_delivered_customer_date#23,order_estimated_delivery_date#24] csv
+- Relation [order_id#81,payment_sequential#82,payment_type#83,payment_installments#84,payment_value#85] csv


In [None]:
# Unindo DataFrames diferentes com mesmo numero de colunas
df_incorrect_union = df_order_payments.union(df_customers)

# Aplicando algumas validações
df_order_payments.groupBy("payment_type").agg(expr("count(1) AS qtd")).show()

+------------+-----+
|payment_type|  qtd|
+------------+-----+
|      boleto|19784|
| not_defined|    3|
| credit_card|76795|
|     voucher| 5775|
|  debit_card| 1529|
+------------+-----+



### join()

In [None]:
# Analisando média de preços de produtos
df_product_category = df_order_items.join(
    other=df_products,
    on=[df_order_items.product_id == df_products.product_id],
    how="left"
).groupBy("product_category_name").agg(
    expr("count(1) AS qty_cat_sales"),
    expr("round(sum(price), 2) AS sum_cat_sales"),
    expr("round(avg(price), 2) AS avg_cat_price"),
    expr("max(price) AS max_cat_price"),
    expr("min(price) AS min_cat_price")
).orderBy("qty_cat_sales", ascending=False)

# Visualizando dados
df_product_category.show(10, truncate=False)

+----------------------+-------------+-------------+-------------+-------------+-------------+
|product_category_name |qty_cat_sales|sum_cat_sales|avg_cat_price|max_cat_price|min_cat_price|
+----------------------+-------------+-------------+-------------+-------------+-------------+
|cama_mesa_banho       |11115        |1036988.68   |93.3         |1999.98      |6.99         |
|beleza_saude          |9670         |1258681.34   |130.16       |3124.0       |1.2          |
|esporte_lazer         |8641         |988048.97    |114.34       |4059.0       |4.5          |
|moveis_decoracao      |8334         |729762.49    |87.56        |1899.0       |4.9          |
|informatica_acessorios|7827         |911954.32    |116.51       |3699.99      |3.9          |
|utilidades_domesticas |6964         |632248.66    |90.79        |6735.0       |3.06         |
|relogios_presentes    |5991         |1205005.68   |201.14       |3999.9       |8.99         |
|telefonia             |4545         |323667.53   

In [None]:
# Cruzando dados de múltiplos conjuntos
df_sales = df_order_items.join(
    other=df_products,
    on=[df_order_items.product_id == df_products.product_id],
    how='left'
).join(
    other=df_sellers,
    on=[df_order_items.seller_id == df_sellers.seller_id],
    how='left'
).selectExpr(
    "order_id AS id_pedido",
    "product_category_name AS categoria_produto",
    "seller_city AS cidade_vendedor",
    "seller_state AS estado_vendedor",
    "price AS vlr_venda",
    "freight_value AS vlr_frete"
)

# Visualizando resultado
df_sales.show(5, truncate=False)

+--------------------------------+------------------+---------------+---------------+---------+---------+
|id_pedido                       |categoria_produto |cidade_vendedor|estado_vendedor|vlr_venda|vlr_frete|
+--------------------------------+------------------+---------------+---------------+---------+---------+
|00010242fe8c5a6d1ba2dd792cb16214|cool_stuff        |volta redonda  |SP             |58.9     |13.29    |
|00018f77f2f0320c557190d7a144bdd3|pet_shop          |sao paulo      |SP             |239.9    |19.93    |
|000229ec398224ef6ca0657da4fc703e|moveis_decoracao  |borda da mata  |MG             |199.0    |17.87    |
|00024acbcdf0a6daa1e931b038114c75|perfumaria        |franca         |SP             |12.99    |12.79    |
|00042b26cf59d7ce69dfabb4e55b4fd9|ferramentas_jardim|loanda         |PR             |199.9    |18.14    |
+--------------------------------+------------------+---------------+---------------+---------+---------+
only showing top 5 rows



## SparkSQL

In [None]:
# Criando tabelas temporárias
df_orders.createOrReplaceTempView("tbl_orders")
df_order_items.createOrReplaceTempView("tbl_order_items")

# Gerando visão história de vendas
df_ecommerce_hist = spark.sql("""
    SELECT
        anomes_pedido,
        anomesdia_pedido,
        count(DISTINCT id_pedido) AS qtd_pedidos,
        count(id_item) AS qtd_produtos,
        round(sum(vlr_item), 2) AS soma_vendas,
        round(sum(vlr_frete), 2) AS soma_frete

    FROM (
        SELECT
            o.order_id AS id_pedido,
            i.product_id AS id_item,
            i.price AS vlr_item,
            i.freight_value AS vlr_frete,
            date_format(o.order_purchase_timestamp, 'yyyyMM') AS anomes_pedido,
            date_format(o.order_purchase_timestamp, 'yyyyMMdd') AS anomesdia_pedido

        FROM tbl_orders AS o

        LEFT JOIN tbl_order_items AS i
            ON o.order_id = i.order_id

        WHERE lower(o.order_status) = 'delivered'
    )

    GROUP BY
        anomes_pedido,
        anomesdia_pedido
    
    ORDER BY anomesdia_pedido ASC

""")

# Visualizando conjunto
df_ecommerce_hist.show(15)




+-------------+----------------+-----------+------------+-----------+----------+
|anomes_pedido|anomesdia_pedido|qtd_pedidos|qtd_produtos|soma_vendas|soma_frete|
+-------------+----------------+-----------+------------+-----------+----------+
|       201609|        20160915|          1|           3|     134.97|      8.49|
|       201610|        20161003|          7|           7|     441.98|    117.55|
|       201610|        20161004|         54|          63|    8595.89|   1225.53|
|       201610|        20161005|         35|          48|    6169.77|   1039.73|
|       201610|        20161006|         41|          47|    5889.96|    908.94|
|       201610|        20161007|         38|          44|    6075.35|    749.11|
|       201610|        20161008|         36|          41|    7592.89|    883.04|
|       201610|        20161009|         20|          26|     2399.7|    504.49|
|       201610|        20161010|         34|          37|    3159.57|    737.16|
|       201612|        20161

                                                                                

___