In [0]:

# Fontes e bibliotecas
from pyspark.sql.functions import input_file_name
from pyspark.sql.types import *
import urllib.request # Requisição para baixar arquivo na internet


# Parameters
# The original link to the file had a typo: it read 'sour-ce' where the bucket name is actually 'source'.
s3_path = "s3a://data-architect-test-source/order.json.gz"
bronze_path = "dbfs:/mnt/bronze/raw_order"

# Download step (disabled): the file is read directly from S3 by Spark below.
#urllib.request.urlretrieve(url, local_path)

# Read the JSON; multiLine=True allows a single record to span several lines.
df = spark.read.json(s3_path, multiLine=True)

# Inspect the inferred structure.
# Only the last expression of a cell is rendered automatically, so the column
# list and the schema are printed explicitly instead of being silently discarded.
print(df.columns)
print(df.schema.simpleString())
display(df)

# Writing to DBFS
# Nothing is persisted to the Bronze layer for now.
# df.write.format("delta").mode("overwrite").save(bronze_path)

# Register as a table for SQL manipulation
# spark.sql(f"""
#          CREATE TABLE IF NOT EXISTS raw_order
#          USING DELTA
#          LOCATION '{bronze_path}'
#          """)

In [0]:
# Código para estudar o campo item
from pyspark.sql.functions import explode, col, from_json
from pyspark.sql.types import *

# Load the raw JSON again so this cell can be executed on its own
s3_path = "s3a://data-architect-test-source/order.json.gz"
df = spark.read.json(s3_path, multiLine=True)

# Monetary amounts are nested structs; only their "value" member is parsed, as a string
money_struct = StructType([StructField("value", StringType(), True)])

# Fields that an item and each of its garnish entries have in common
shared_fields = [
    StructField("name", StringType(), True),
    StructField("quantity", DoubleType(), True),
    StructField("unitPrice", money_struct, True),
    StructField("totalValue", money_struct, True),
]

# Schema of one garnish entry
garnish_schema = StructType(shared_fields + [
    StructField("categoryName", StringType(), True),
    StructField("externalId", StringType(), True),
])

# Schema of the "items" field: an array of items, each carrying its garnishItems array
item_schema = ArrayType(StructType(shared_fields + [
    StructField("garnishItems", ArrayType(garnish_schema), True),
]))

# Parse the "items" string column into an array of structs
df_com_array = df.withColumn("items_parsed", from_json(col("items"), item_schema))

# One row per item of each order.
# NOTE: explode() emits no row when the array is null or empty.
df_items_exploded = df_com_array.select(
    col("order_id"),
    explode(col("items_parsed")).alias("item"),
)

# (source path, output alias) pairs for the item-level columns
item_columns = [
    ("item.name", "item_name"),
    ("item.quantity", "item_quantity"),
    ("item.unitPrice.value", "item_unit_value"),
    ("item.totalValue.value", "item_total_value"),
]

# One row per garnish entry of each item, keeping the item-level columns alongside
df_garnish = df_items_exploded.select(
    "order_id",
    *[col(source).alias(target) for source, target in item_columns],
    explode(col("item.garnishItems")).alias("garnish"),
)

# (source path, output alias) pairs for the garnish-level columns
garnish_columns = [
    ("garnish.name", "garnish_name"),
    ("garnish.quantity", "garnish_quantity"),
    ("garnish.unitPrice.value", "garnish_unit_value"),
    ("garnish.totalValue.value", "garnish_total_value"),
    ("garnish.categoryName", "garnish_category"),
]

# Flatten the garnish struct into plain columns next to the item columns
df_garnish_final = df_garnish.select(
    "order_id",
    *[target for _, target in item_columns],
    *[col(source).alias(target) for source, target in garnish_columns],
)

display(df_garnish_final)



In [0]:
# Código para verificar em SQL
from pyspark.sql.functions import input_file_name
from pyspark.sql.types import *

# Evaluate one specific case: load the orders file so it can be queried with SQL
df = spark.read.json("s3a://data-architect-test-source/order.json.gz", multiLine=True)

# Expose the DataFrame to Spark SQL as a temporary view
df.createOrReplaceTempView("orders_raw")

# Minimum and maximum order_created_at over the whole view.
# The commented-out lines inside the query are order_id filters kept from
# earlier checks (high values, duplicates, customer_id lookup).
consulta = """
  SELECT min(order_created_at),max(order_created_at)
  FROM orders_raw
  --where order_id = 
  -- 'a60c8379657ab06d6324d045f3881e348979f5a41c6d98220a12d4ad3cb2f8ed' -- Verificação de valores altos
  --('0000ef1aa26353a390e8b661aa824da157ed12d3c1d48c9835b962559008a011',
  --'0005cd36901075f2f64b4a2814c3354770fb2601f779f3a0ab7e2a72e876d73d') -- Verificação de duplicidades
  --'653d87f714e6e40260de58014bfe284bb8528852b63cae033203f26ebd58f786' -- Teste para encontrar o customer_id
  --'4ff64b33b272c1886df21b63272220af6a82d1667dba70dad201810d98608dd8' 
  
"""
resultado = spark.sql(consulta)

display(resultado)

In [0]:
# Verificação de dados únicos
from pyspark.sql.functions import countDistinct, count
from pyspark.sql.functions import input_file_name
from pyspark.sql.types import *

s3_path = "s3a://data-architect-test-source/order.json.gz"
bronze_path = "dbfs:/mnt/bronze/raw_order"

# Read the JSON
df = spark.read.json(s3_path, multiLine=True)

# Compute every figure in a single aggregation so the source file is scanned
# once instead of once per metric (previously four separate Spark actions).
# An aggregation without grouping always yields exactly one row.
uniqueness_stats = df.agg(
    count("*").alias("total_rows"),
    countDistinct("customer_id").alias("distinct_customer_id"),
    countDistinct("order_id").alias("distinct_order_id"),
).first()

# The "total" figures are row counts (null values included), so both are the
# same number; the distinct figures come from countDistinct, which skips nulls.
# customer_id
total_customer_id = uniqueness_stats["total_rows"]
distinct_customer_id = uniqueness_stats["distinct_customer_id"]

# order_id
total_order_id = uniqueness_stats["total_rows"]
distinct_order_id = uniqueness_stats["distinct_order_id"]

# Result
print(f"Total de customer_id: {total_customer_id}")
print(f"Customer_id distintos: {distinct_customer_id}")
print(f"Total de order_id: {total_order_id}")
print(f"Order_id distintos: {distinct_order_id}")
