In [0]:
# ============================================================
# GOLD – Consolidação dos Metadados PRODES
# ============================================================
# e_tbras_xml_gold_valor (GOLD)
from pyspark.sql import functions as F

# ================================================================
# 1. Parameters
# ================================================================
# Widget names, listed in the same order in which they are read below.
PARAM_NAMES = (
    "catalog",
    "schema_in",
    "schema_out",
    "table_in",
    "table_out",
    "data_ref_carga",  # not used to pick the Silver partition; meant for the write step
)

# Read every widget in one pass; the unpacking target order mirrors PARAM_NAMES.
catalog, schema_in, schema_out, table_in, table_out, data_ref_carga = (
    dbutils.widgets.get(param_name) for param_name in PARAM_NAMES
)

# Every parameter is mandatory: an empty (falsy) widget value aborts the run.
required_values = (catalog, schema_in, schema_out, table_in, table_out, data_ref_carga)
if not all(required_values):
    raise ValueError("Parâmetros obrigatórios ausentes.")

# Fully qualified name of the Silver source table.
silver_table = ".".join((catalog, schema_in, table_in))
print("Lendo Silver:", silver_table)

In [0]:
# ================================================================
# 2. Identify the most recent partition
# ================================================================
# A global aggregate always yields exactly one row; its single value is NULL
# when the Silver table holds no rows (or only NULL load dates).
latest_partition_row = (
    spark.table(silver_table)
    .agg(F.max("data_ref_carga").alias("max_date"))
    .first()
)
max_dt = latest_partition_row["max_date"]

# Nothing to process: end the notebook with the NO_DATA status.
if not max_dt:
    dbutils.notebook.exit("NO_DATA")

print("Partição mais recente detectada:", max_dt)

In [0]:
# ================================================================
# 3. Read Silver for the most recent partition
# ================================================================
# Keep only the rows that belong to the latest load date found in step 2.
df = (
    spark.table(silver_table)
    .filter(F.col("data_ref_carga") == max_dt)
)

# Emptiness probe: fetching at most one row is enough to know whether the
# partition has data, and avoids the full scan that count() would trigger.
if not df.head(1):
    dbutils.notebook.exit("NO_DATA")

In [0]:
# ================================================================
# 4. GOLD logic (full transformation)
# ================================================================
# Input:  `df`, the Silver rows of the latest partition (left untouched so this
#         cell can be re-run without depending on its own previous output).
# Output: `df_gold`, the Silver rows enriched with derived columns and
#         projected onto the Gold column list.

# Kilometres per degree of arc used by the bounding-box area estimate.
# NOTE(review): the same factor is applied to latitude and longitude, i.e. the
# shrinking of a longitude degree away from the equator is not modelled, so the
# result is an approximation. Kept as-is to preserve published values.
KM_PER_DEGREE = 111

north = F.col("northBoundLatitude")
south = F.col("southBoundLatitude")
east = F.col("eastBoundLongitude")
west = F.col("westBoundLongitude")

has_lat_bounds = north.isNotNull() & south.isNotNull()
has_lon_bounds = east.isNotNull() & west.isNotNull()

# --- bounding box: area and centre (NULL whenever a needed bound is missing) ---
bbox_area_km2 = F.when(
    has_lat_bounds & has_lon_bounds,
    F.abs(north - south) * F.abs(east - west) * KM_PER_DEGREE * KM_PER_DEGREE,
)
bbox_center_lat = F.when(has_lat_bounds, (north + south) / 2)
bbox_center_lon = F.when(has_lon_bounds, (east + west) / 2)

# --- e-mails and keywords: drop NULL entries from the arrays ---
all_emails = F.expr("filter(emails, x -> x is not null)")
all_keywords = F.expr("filter(keywords, x -> x is not null)")

# Guarded lookup: with ANSI mode enabled, element_at raises on an empty array
# instead of returning NULL, so the element is only read when one exists.
first_email = F.when(
    F.size("all_emails") > 0,
    F.expr("element_at(all_emails, 1)"),
)

# size() of a NULL array is -1 under default settings (NULL under ANSI);
# report a plain 0 when there are no keywords at all.
keyword_count = (
    F.when(F.col("all_keywords").isNull(), F.lit(0))
    .otherwise(F.size("all_keywords"))
)

# --- quality score: one point per populated metadata element (0 to 6) ---
quality_score = (
    F.when(F.col("abstract").isNotNull(), 1).otherwise(0)
    + F.when(F.col("purpose").isNotNull(), 1).otherwise(0)
    + F.when(F.col("title").isNotNull(), 1).otherwise(0)
    + F.when(F.col("publication_date").isNotNull(), 1).otherwise(0)
    + F.when(F.col("keyword_count") > 0, 1).otherwise(0)
    + F.when(F.col("first_email").isNotNull(), 1).otherwise(0)
)

# Columns are added in dependency order: first_email needs all_emails,
# keyword_count needs all_keywords, quality_score needs both derived values.
df_enriched = (
    df
    .withColumn("bbox_area_km2", bbox_area_km2)
    .withColumn("bbox_center_lat", bbox_center_lat)
    .withColumn("bbox_center_lon", bbox_center_lon)
    .withColumn("all_emails", all_emails)
    .withColumn("first_email", first_email)
    .withColumn("all_keywords", all_keywords)
    .withColumn("keyword_count", keyword_count)
    .withColumn("quality_score", quality_score)
)

# --- final GOLD projection ---
GOLD_COLUMNS = [
    "data_ref_carga",
    "title",
    "abstract",
    "purpose",
    "publication_date",
    "bbox_area_km2",
    "bbox_center_lat",
    "bbox_center_lon",
    "eastBoundLongitude",
    "westBoundLongitude",
    "northBoundLatitude",
    "southBoundLatitude",
    "all_keywords",
    "keyword_count",
    "first_email",
    "all_emails",
    "graphic_file",
    "graphic_description",
    "license_info",
    "quality_score",
]

df_gold = df_enriched.select(*GOLD_COLUMNS)

In [0]:
# ================================================================
# 5. Write GOLD (always written under the date given as parameter)
# ================================================================
gold_table = f"{catalog}.{schema_out}.{table_out}"

# The rows were read from the most recent Silver partition, so their
# data_ref_carga still carries the Silver date. Stamp them with the
# `data_ref_carga` parameter so the table really is written under the requested
# date. The cast keeps the column's existing data type, so the Gold schema does
# not drift to string.
load_date_type = df_gold.schema["data_ref_carga"].dataType
df_gold_stamped = df_gold.withColumn(
    "data_ref_carga",
    F.lit(data_ref_carga).cast(load_date_type),
)

# Full refresh: the whole table (and its schema) is replaced on every run.
(
    df_gold_stamped.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(gold_table)
)

print("TABLE GOLD GRAVADA COM SUCESSO:", gold_table)
dbutils.notebook.exit("SUCCESS")