## Consultando os arquivos na camada silver

## Criando diretório Silver

In [0]:
# Layer locations: SILVER_PATH is the output root, BRONZE_PATH the input root.
SILVER_PATH = "/Volumes/workspace/default/delta/silver"
BRONZE_PATH = "/Volumes/workspace/default/delta/bronze"

# Ensure the Silver root directory exists before anything is written to it.
dbutils.fs.mkdirs(SILVER_PATH)

In [0]:
# List what is stored under the Bronze flight directory.
display(dbutils.fs.ls(BRONZE_PATH + "/flight/"))

## Carregando as tabelas da camada bronze


In [0]:
# Load the Bronze flight tables that feed the Silver layer.
# The locations are derived from BRONZE_PATH so this cell always reads from
# the same root as the directory listing, instead of a hard-coded copy of the
# path that could silently drift if the constant is changed.
# NOTE(review): these frames hold Bronze data despite the `_silver` suffix;
# the names are kept because later cells reference them.
bronze_flight_dir = f"dbfs:{BRONZE_PATH}/flight"
df_voos_silver = spark.read.format("delta").load(f"{bronze_flight_dir}/registro_voos")
df_codigos_silver = spark.read.format("delta").load(f"{bronze_flight_dir}/codigo_voos/")


### Tratando os dados

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
# Report how many rows were loaded, then preview the first five.
bronze_row_count = df_voos_silver.count()
print(f"Registros Bronze: {bronze_row_count:,}")
df_voos_silver.show(n=5)

In [0]:
# Count null values per column before any cleaning is applied.
print("=== Nulos por coluna ===")

# One aggregate per column: when() yields a value only for null cells and
# count() tallies those, so each output column holds its own null count.
null_count_exprs = [
    count(when(col(column_name).isNull(), column_name)).alias(column_name)
    for column_name in df_voos_silver.columns
]
df_voos_silver.select(null_count_exprs).show()



In [0]:
# Cleaning step: keep only rows where delay, origin and destination are
# present and distance is positive, then remove fully duplicated rows.
# A null distance does not satisfy `> 0`, so those rows are dropped too.
is_valid_row = (
    col("delay").isNotNull()
    & col("origin").isNotNull()
    & col("destination").isNotNull()
    & (col("distance") > 0)
)
df_delays_clean = df_voos_silver.filter(is_valid_row).dropDuplicates()

clean_row_count = df_delays_clean.count()
print(f"Registros após limpeza: {clean_row_count:,}")

In [0]:
# Add derived columns on top of the cleaned flights:
#   delay_minutes  - `delay` cast to integer
#   distance_km    - `distance` * 1.60934, rounded to 2 decimals
#                    (1.60934 is the miles-to-km factor, so `distance` is
#                    presumably in miles - TODO confirm)
#   is_delayed     - 1 when delay is above 15, otherwise 0
#   delay_category - textual bucket of the delay (see thresholds below)
# `round` here is the Spark column function brought in by the wildcard
# import of pyspark.sql.functions, not the Python builtin.
delay_col = col("delay")

# Buckets are evaluated top to bottom; the first matching branch wins.
# NOTE(review): the labels are spelled "atrazado" (standard Portuguese is
# "atrasado"); left untouched because the values are persisted to Silver and
# downstream consumers may match on them.
delay_category_expr = (
    when(delay_col <= 0, "Na hora")
    .when(delay_col <= 15, "Pouco atrazado")
    .when(delay_col <= 60, "Atrazado")
    .otherwise("Muito atrazado")
)

df_delays_clean = (
    df_delays_clean
    .withColumn("delay_minutes", delay_col.cast("integer"))
    .withColumn("distance_km", round(col("distance") * 1.60934, 2))
    .withColumn("is_delayed", when(delay_col > 15, 1).otherwise(0))
    .withColumn("delay_category", delay_category_expr)
)

In [0]:
# Preview the transformed data, restricted to the columns of interest.
print("=== Dados transformados ===")
preview_columns = [
    "date",
    "origin",
    "destination",
    "delay_minutes",
    "distance_km",
    "delay_category",
]
df_delays_clean.select(*preview_columns).show(10)

## Explorando as tabelas

In [0]:
# Show the column names of the frame loaded from Bronze (before cleaning).
source_columns = df_voos_silver.columns
display(source_columns)


In [0]:
# Persist the cleaned and enriched flights as a Delta table in Silver.
# The write replaces existing data (overwrite) with mergeSchema enabled.
silver_voos_target = f"{SILVER_PATH}/flight/registro_voos"
(
    df_delays_clean.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .save(silver_voos_target)
)

In [0]:
# Persist the flight-codes frame, unchanged, as a Delta table in Silver.
# NOTE(review): the target folder is "codigos_voos" while the Bronze source
# folder is "codigo_voos" - confirm the naming difference is intended.
silver_codigos_target = f"{SILVER_PATH}/flight/codigos_voos"
(
    df_codigos_silver.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .save(silver_codigos_target)
)