In [36]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import col, row_number, countDistinct, count, lit, round

# Build the notebook's Spark session, reusing an already-active one if present.
spark = (
    SparkSession.builder
    .appName("session")
    .getOrCreate()
)

# Enable Adaptive Query Execution (runtime re-optimisation of query plans).
spark.conf.set("spark.sql.adaptive.enabled", "true")

# Convertendo os arquivos de CSV para Parquet

In [4]:
# Load the raw Olist CSV exports with PySpark's built-in CSV reader.
# The first line of each file is used as the header. No schema is supplied and
# schema inference is not requested, so the reader's default column typing applies.
olist_customers = spark.read.csv("olist_customers_dataset.csv", header=True)
olist_orders = spark.read.csv("olist_orders_dataset.csv", header=True)
olist_order_reviews = spark.read.csv("olist_order_reviews_dataset.csv", header=True)
olist_order_items = spark.read.csv("olist_order_items_dataset.csv", header=True)
olist_products = spark.read.csv("olist_products_dataset.csv", header=True)

In [5]:
# Persist each DataFrame as Parquet. mode("overwrite") replaces any output left
# by a previous run, so this cell can be re-executed safely.
# Each dataset is written exactly once.
olist_customers.write.mode("overwrite").parquet("olist_customers_dataset.parquet")
olist_orders.write.mode("overwrite").parquet("olist_orders_dataset.parquet")
olist_order_reviews.write.mode("overwrite").parquet("olist_order_reviews_dataset.parquet")
olist_order_items.write.mode("overwrite").parquet("olist_order_items_dataset.parquet")
olist_products.write.mode("overwrite").parquet("olist_products_dataset.parquet")

In [7]:
# Re-load every dataset from its Parquet copy, rebinding the same variable names
# so the cells below work on the Parquet-backed DataFrames.
# Parquet files store their own schema, so the CSV-only "header" option is not passed.
olist_customers = spark.read.parquet("olist_customers_dataset.parquet")
olist_orders = spark.read.parquet("olist_orders_dataset.parquet")
olist_order_reviews = spark.read.parquet("olist_order_reviews_dataset.parquet")
olist_order_items = spark.read.parquet("olist_order_items_dataset.parquet")
olist_products = spark.read.parquet("olist_products_dataset.parquet")

# Verificando a volumetria de pedidos e produtos de cada pedido

In [37]:
# Per-order item statistics, one output row per order_id:
#   total    -> count of product_id values (non-null) among the order's item rows
#   distinto -> count of distinct product_id values among the order's item rows
itens_por_pedido = olist_order_items.groupBy("order_id")
produtos = itens_por_pedido.agg(
    count(col("product_id")).alias("total"),
    countDistinct(col("product_id")).alias("distinto"),
)

In [38]:
# Distribution of orders by item count: one row per distinct `total`,
# carrying how many orders have that many items.
pedidos_por_total = count("*").alias("volumetria_pedidos")
volumetria = produtos.groupBy("total").agg(pedidos_por_total)

# Grand total of orders across all buckets, pulled to the driver as a plain
# Python value. A global aggregate yields a single row, so the first row's
# first field is the sum.
soma_volumetria = volumetria.agg({"volumetria_pedidos": "sum"})
total_pedidos = soma_volumetria.first()[0]

In [39]:
# Share of all orders that falls in each bucket, as a percentage rounded to
# two decimal places.
# NOTE: `round` here is pyspark.sql.functions.round (imported at the top of the
# notebook), not Python's builtin round.
percentual_pedidos = round(col("volumetria_pedidos") / lit(total_pedidos) * 100, 2)
volumetria = volumetria.withColumn("volumetria_pedidos_perc", percentual_pedidos)

# Display the five smallest item counts.
volumetria.sort("total").show(n=5)

+-----+------------------+-----------------------+
|total|volumetria_pedidos|volumetria_pedidos_perc|
+-----+------------------+-----------------------+
|    1|             88863|                  90.06|
|    2|              7516|                   7.62|
|    3|              1322|                   1.34|
|    4|               505|                   0.51|
|    5|               204|                   0.21|
+-----+------------------+-----------------------+
only showing top 5 rows



In [45]:
# SQL version of the same order-volume analysis.
# Expose the order-items DataFrame to Spark SQL under the name `order_items`.
olist_order_items.createOrReplaceTempView("order_items")

# Temp view `produtos`: one row per order with its item count (`total`) and
# its distinct-product count (`distinto`).
sql_view_produtos = """
    CREATE OR REPLACE TEMP VIEW produtos AS
    SELECT 
        order_id,
        COUNT(product_id) AS total,
        COUNT(DISTINCT product_id) AS distinto
    FROM order_items
    GROUP BY order_id
"""
spark.sql(sql_view_produtos)

# Orders per item count plus each bucket's percentage of all orders. The
# window aggregate SUM(COUNT(*)) OVER () supplies the grand total across the
# grouped rows, so no separate driver-side total is needed.
sql_volumetria = """
    SELECT 
        total,
        COUNT(*) AS volumetria_pedidos,
        ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) AS volumetria_pedidos_perc
    FROM produtos
    GROUP BY total
    ORDER BY total
"""
resultado = spark.sql(sql_volumetria)

# Display the first five rows without truncating cell contents.
resultado.show(n=5, truncate=False)

+-----+------------------+-----------------------+
|total|volumetria_pedidos|volumetria_pedidos_perc|
+-----+------------------+-----------------------+
|1    |88863             |90.06                  |
|2    |7516              |7.62                   |
|3    |1322              |1.34                   |
|4    |505               |0.51                   |
|5    |204               |0.21                   |
+-----+------------------+-----------------------+
only showing top 5 rows

