# In [None]:
from datetime import datetime, timezone
from typing import Dict, List, Optional

import logging

import pyspark.sql.functions as F
from pyspark.sql import DataFrame
from delta.tables import DeltaTable
from pyspark.sql.types import StructType, StructField, TimestampType, StringType, BooleanType
from pyspark.sql.window import Window

# Optional catalog prefix for every table name built by qname(); an empty
# string yields two-level "schema.table" identifiers.
CATALOG = ""

# Location of the landing volume. These three values are not referenced by
# the code visible in this file.
VOLUME_CATALOG = "main"
VOLUME_SCHEMA = "engenharia_dados"
VOLUME_NAME = "aviacao_landing"

# Schemas used by this pipeline: Bronze changelog tables (input), Silver
# dimensions (output) and the watermark/metadata table.
BRONZE_SCHEMA = "aviacao_bronze"
SILVER_SCHEMA = "aviacao_silver"
META_SCHEMA = "aviacao_meta"
ORIGEM_SISTEMA = "postgres-aviacao"

# Per-table settings. Every table currently shares the same source schema and
# the same single-column business key, so the entries are generated from the
# list of names; each table still gets its own dict/list objects.
TABLE_CONFIGS: Dict[str, Dict] = {
    source_table: {"schema": "aviacao", "business_key": ["id"]}
    for source_table in (
        "companhias_aereas",
        "modelos_avioes",
        "aeroportos",
        "aeronaves",
        "funcionarios",
        "clientes",
        "voos",
        "reservas",
        "bilhetes",
        "bagagens",
        "manutencoes",
        "tripulacao_voo",
    )
}

# Mapping from a string key to a Spark StructType; it starts empty and nothing
# in the visible code reads or fills it.
# NOTE(review): presumably a placeholder for explicit per-table schemas —
# confirm whether it is still needed.
TABLE_SCHEMAS: Dict[str, StructType] = {}

# Module-wide logger at INFO level. A stream handler is attached only when the
# logger has none yet, so executing this code again in the same interpreter
# (e.g. re-running a notebook cell) does not stack duplicate handlers.
logger = logging.getLogger("aviacao_silver")
logger.setLevel(logging.INFO)
if not logger.handlers:
    formatter = logging.Formatter(
        "%(asctime)s [%(levelname)s] %(name)s - %(message)s"
    )
    handler = logging.StreamHandler()
    handler.setFormatter(formatter)
    logger.addHandler(handler)

def qname(schema: str, table: str) -> str:
    """Return "schema.table", prefixed with CATALOG when CATALOG is non-empty."""
    if not CATALOG:
        return f"{schema}.{table}"
    return f"{CATALOG}.{schema}.{table}"


def now_utc() -> datetime:
    """Return the current instant as a timezone-aware datetime in UTC."""
    return datetime.now(tz=timezone.utc)


def init_schema(schema_name: str) -> None:
    """Create the schema if it does not exist, qualifying it with CATALOG when set."""
    if CATALOG:
        schema_qualified = f"{CATALOG}.{schema_name}"
    else:
        schema_qualified = schema_name
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema_qualified}")


def silver_dim_table_name(table_name: str) -> str:
    """Return the qualified name of the Silver dimension "dim_<table_name>"."""
    dim_table = f"dim_{table_name}"
    return qname(SILVER_SCHEMA, dim_table)


def meta_silver_watermark_table() -> str:
    """Return the qualified name of the watermark table kept in META_SCHEMA."""
    return qname(schema=META_SCHEMA, table="silver_bronze_watermark")

def init_meta_silver_watermark() -> None:
    """
    Create the metadata schema and the watermark Delta table if they are missing.

    The table holds one row per source table (column "tabela") with the last
    processed reference timestamp and the time of the last run. Adding the
    primary-key constraint is best-effort: a failure never aborts the pipeline,
    but the actual error is logged so that causes other than "constraint
    already exists" are not hidden.
    """
    init_schema(META_SCHEMA)
    meta_table = meta_silver_watermark_table()
    spark.sql(f"""
        CREATE TABLE IF NOT EXISTS {meta_table} (
            tabela STRING NOT NULL,
            ultima_data_ref TIMESTAMP NOT NULL,
            ultima_execucao_ts TIMESTAMP NOT NULL
        )
        USING DELTA
    """)
    try:
        spark.sql(f"""
            ALTER TABLE {meta_table}
            ADD CONSTRAINT pk_silver_bronze_watermark PRIMARY KEY (tabela)
        """)
    except Exception as exc:
        # Spark errors can span many lines (JVM stack trace); keep only the
        # first non-empty line, falling back to the exception class name.
        reason = next(
            (line.strip() for line in str(exc).splitlines() if line.strip()),
            type(exc).__name__,
        )
        logger.info(
            f"[META] Constraint pk_silver_bronze_watermark não foi criada em {meta_table} "
            f"(já existe ou não é suportada), ignorando criação. Motivo: {reason}"
        )


def get_last_processed_ts(table_name: str) -> Optional[datetime]:
    """
    Return the stored watermark (max ultima_data_ref) for table_name.

    Args:
        table_name: Value matched against the "tabela" column.

    Returns:
        The watermark timestamp, or None when the watermark table does not
        exist or has no row for table_name.
    """
    meta_table = meta_silver_watermark_table()
    if not spark.catalog.tableExists(meta_table):
        return None

    # A global aggregation always returns exactly one row, and max() over zero
    # rows is NULL, so no separate "is there a row?" job is needed.
    row = (
        spark.table(meta_table)
        .filter(F.col("tabela") == table_name)
        .agg(F.max("ultima_data_ref").alias("max_ts"))
        .collect()[0]
    )
    return row["max_ts"]


def update_watermark(table_name: str, last_bronze_ts: datetime) -> None:
    """
    Upsert the watermark row of table_name in the watermark table.

    Args:
        table_name: Value of the "tabela" column identifying the row. It is
            interpolated into the SQL text as a string literal.
        last_bronze_ts: Timestamp stored as ultima_data_ref, written with
            microsecond precision.
    """
    meta_table = meta_silver_watermark_table()
    # Keep the fractional seconds: read_bronze_incremental filters with
    # "bronze_load_ts > watermark", so a watermark truncated to whole seconds
    # would make the rows of the last second be read again on every run.
    ts_str = last_bronze_ts.strftime("%Y-%m-%d %H:%M:%S.%f")

    spark.sql(f"""
        MERGE INTO {meta_table} AS tgt
        USING (
            SELECT
                '{table_name}' AS tabela,
                TIMESTAMP '{ts_str}' AS ultima_data_ref,
                current_timestamp() AS ultima_execucao_ts
        ) AS src
        ON tgt.tabela = src.tabela
        WHEN MATCHED THEN UPDATE SET
            tgt.ultima_data_ref    = src.ultima_data_ref,
            tgt.ultima_execucao_ts = src.ultima_execucao_ts
        WHEN NOT MATCHED THEN INSERT (tabela, ultima_data_ref, ultima_execucao_ts)
        VALUES (src.tabela, src.ultima_data_ref, src.ultima_execucao_ts)
    """)

    logger.info(f"[{table_name}] Watermark atualizado para {ts_str}.")

def bronze_changelog_table_name(table_name: str) -> str:
    """Return the qualified name of the Bronze table "<table_name>_changelog"."""
    changelog_table = f"{table_name}_changelog"
    return qname(BRONZE_SCHEMA, changelog_table)


def read_bronze_incremental(table_name: str, last_processed_ts: Optional[datetime]) -> DataFrame:
    """
    Read the Bronze changelog of table_name, optionally restricted to new rows.

    Args:
        table_name: Source table whose "<table_name>_changelog" is read.
        last_processed_ts: When given, only rows whose bronze_load_ts is
            strictly greater than it are kept; None returns the whole table.

    Raises:
        ValueError: If the Bronze changelog table does not exist.
    """
    bronze_table = bronze_changelog_table_name(table_name)
    if not spark.catalog.tableExists(bronze_table):
        raise ValueError(f"A tabela Bronze {bronze_table} não existe.")

    changelog = spark.table(bronze_table)
    if last_processed_ts is None:
        return changelog

    newer_than_watermark = F.col("bronze_load_ts") > F.lit(last_processed_ts)
    return changelog.filter(newer_than_watermark)

def ensure_silver_dim_table(table_name: str, df_sample: DataFrame) -> None:
    """
    Create the Silver dimension dim_<table> when it does not exist yet.

    The new table copies the schema of df_sample and appends the SCD2 columns
    that df_sample does not already have:
      vigencia_inicio, vigencia_fim, is_current, aud_dh_criacao,
      aud_dh_alteracao, attr_hash
    An existing table is left untouched.

    Args:
        table_name: Source table name; the dimension is named dim_<table_name>.
        df_sample: DataFrame whose schema is used as the base of the dimension.
    """
    init_schema(SILVER_SCHEMA)
    silver_table = silver_dim_table_name(table_name)

    if spark.catalog.tableExists(silver_table):
        logger.info(f"[{table_name}] Dimensão Silver {silver_table} já existe.")
        return

    logger.info(f"[{table_name}] Criando dimensão Silver {silver_table} baseada no schema da Bronze.")

    base_schema: StructType = df_sample.schema
    existing_cols = {field.name for field in base_schema.fields}

    # SCD2 bookkeeping columns, in the order they are appended to the schema.
    scd2_columns = (
        ("vigencia_inicio", TimestampType),
        ("vigencia_fim", TimestampType),
        ("is_current", BooleanType),
        ("aud_dh_criacao", TimestampType),
        ("aud_dh_alteracao", TimestampType),
        ("attr_hash", StringType),
    )
    metadata_fields: List[StructField] = [
        StructField(col_name, col_type(), nullable=True)
        for col_name, col_type in scd2_columns
        if col_name not in existing_cols
    ]

    silver_schema = StructType(list(base_schema.fields) + metadata_fields)

    # Writing an empty DataFrame materialises the table with the target schema.
    writer = spark.createDataFrame([], silver_schema).write
    writer.mode("overwrite").format("delta").saveAsTable(silver_table)

    logger.info(f"[{table_name}] Dimensão Silver {silver_table} criada como SCD2.")

def process_table(table_name: str) -> None:
    """
    Apply the incremental Bronze changelog of one table to its Silver SCD2 dimension.

    Steps:
      1. Read the Bronze rows whose bronze_load_ts is greater than the stored
         watermark (all rows when there is no watermark).
      2. Keep only the latest row per business key (data_ref, then
         bronze_load_ts, both descending).
      3. Hash the non-technical columns into attr_hash and compare it with the
         current version of each key in the dimension.
      4. Close the current versions whose hash differs.
      5. Insert new keys and changed keys as the new current version.
      6. Advance the watermark to the largest bronze_load_ts that was read.

    Args:
        table_name: Key of TABLE_CONFIGS identifying the source table.

    Raises:
        ValueError: If table_name is not in TABLE_CONFIGS, or if the Bronze
            changelog table does not exist (raised by read_bronze_incremental).
    """
    if table_name not in TABLE_CONFIGS:
        raise ValueError(f"Tabela '{table_name}' não está configurada em TABLE_CONFIGS.")

    business_key_cols = TABLE_CONFIGS[table_name]["business_key"]

    logger.info(f"================ INÍCIO SILVER (SCD2): {table_name} ================")

    last_ts = get_last_processed_ts(table_name)
    logger.info(f"[{table_name}] Último bronze_load_ts processado: {last_ts}")

    df_bronze = read_bronze_incremental(table_name, last_ts)

    # A single aggregation yields both the row count (emptiness check) and the
    # bronze_load_ts range, instead of one Spark job for each.
    stats = df_bronze.agg(
        F.count(F.lit(1)).alias("qtd_bronze"),
        F.min("bronze_load_ts").alias("min_bronze_ts"),
        F.max("bronze_load_ts").alias("max_bronze_ts"),
    ).collect()[0]

    if stats["qtd_bronze"] == 0:
        logger.info(f"[{table_name}] Nenhum dado incremental na Bronze para processar.")
        logger.info(f"================ FIM SILVER (sem dados): {table_name} ================")
        return

    ensure_silver_dim_table(table_name, df_bronze)
    silver_table = silver_dim_table_name(table_name)

    max_bronze_ts = stats["max_bronze_ts"]
    logger.info(
        f"[{table_name}] Faixa incremental Bronze - bronze_load_ts: "
        f"[{stats['min_bronze_ts']}, {stats['max_bronze_ts']}]"
    )

    # Rank the rows of each business key from newest to oldest.
    w = Window.partitionBy(*[F.col(c) for c in business_key_cols]).orderBy(
        F.col("data_ref").desc(),
        F.col("bronze_load_ts").desc(),
    )

    # Only the newest row per key is applied; older rows of the batch are dropped.
    df_changes = (
        df_bronze
        .withColumn("row_number", F.row_number().over(w))
        .filter(F.col("row_number") == 1)
        .drop("row_number")
    )

    technical_cols = set(business_key_cols + ["data_ref", "bronze_load_ts", "origem_sistema", "change_op"])
    attr_cols = [c for c in df_changes.columns if c not in technical_cols]

    # NOTE(review): concat_ws skips NULL values, so rows that differ only in
    # which attribute is NULL produce the same hash. The formula is kept as is
    # because the hashes already stored in the dimension were built with it;
    # changing it requires recomputing attr_hash for the existing rows.
    df_changes = df_changes.withColumn(
        "attr_hash",
        F.sha2(F.concat_ws("||", *[F.col(c).cast("string") for c in attr_cols]), 256)
    )

    logger.info(f"[{table_name}] Registros em df_changes: {df_changes.count()}")

    df_silver_current = spark.table(silver_table).filter(F.col("is_current"))

    join_expr = [
        F.col(f"chg.{c}") == F.col(f"dim.{c}")
        for c in business_key_cols
    ]

    # Left join: dim.* is NULL for keys that have no current version.
    df_join = df_changes.alias("chg").join(
        df_silver_current.alias("dim"),
        on=join_expr,
        how="left",
    )

    delta_dim = DeltaTable.forName(spark, silver_table)

    # Both merges match a change against the current version of the same key.
    cond_parts = [f"dim.{c} = chg.{c}" for c in business_key_cols]
    cond_parts.append("dim.is_current = true")
    merge_condition = " AND ".join(cond_parts)

    # Keys that have a current version whose attributes changed.
    df_to_close = (
        df_join
        .filter(
            F.col("dim.attr_hash").isNotNull() &
            (F.col("dim.attr_hash") != F.col("chg.attr_hash"))
        )
        .select(
            *[F.col(f"chg.{c}").alias(c) for c in business_key_cols],
            F.col("chg.data_ref").alias("data_ref"),
        )
        .dropDuplicates(business_key_cols)
    )

    qtd_close = df_to_close.count()
    if qtd_close == 0:
        logger.info(f"[{table_name}] Nenhum registro atual na Silver para fechar (sem mudanças de atributo).")
    else:
        logger.info(f"[{table_name}] Fechando {qtd_close} registros atuais na Silver.")

        (
            delta_dim.alias("dim")
            .merge(
                df_to_close.alias("chg"),
                merge_condition,
            )
            .whenMatchedUpdate(
                set={
                    # The old version ends just before the new one starts.
                    "vigencia_fim": "chg.data_ref - INTERVAL 1 MICROSECOND",
                    "is_current": "false",
                    "aud_dh_alteracao": "current_timestamp()",
                }
            )
            .execute()
        )

    # New keys (no current version) plus keys whose attributes changed.
    df_to_insert = (
        df_join
        .filter(
            F.col("dim.attr_hash").isNull() |
            (F.col("dim.attr_hash") != F.col("chg.attr_hash"))
        )
        .select("chg.*")
    )

    qtd_insert = df_to_insert.count()
    if qtd_insert == 0:
        logger.info(f"[{table_name}] Nenhum registro novo/alterado para inserir na Silver.")
    else:
        logger.info(f"[{table_name}] Inserindo {qtd_insert} registros (novos/alterados) na Silver.")

        dim_schema = spark.table(silver_table).schema
        meta_cols = {"vigencia_inicio", "vigencia_fim", "is_current", "aud_dh_criacao", "aud_dh_alteracao", "attr_hash"}
        base_cols = [f.name for f in dim_schema.fields if f.name not in meta_cols]

        # Copy only the dimension columns that the incoming rows actually carry.
        incoming_cols = set(df_to_insert.columns)
        values_map = {c: f"chg.{c}" for c in base_cols if c in incoming_cols}

        values_map.update({
            "vigencia_inicio": "chg.data_ref",
            "vigencia_fim": "TIMESTAMP '9999-12-31 23:59:59'",
            "is_current": "true",
            "aud_dh_criacao": "current_timestamp()",
            "aud_dh_alteracao": "current_timestamp()",
            "attr_hash": "chg.attr_hash",
        })

        (
            delta_dim.alias("dim")
            .merge(
                df_to_insert.alias("chg"),
                merge_condition,
            )
            .whenNotMatchedInsert(
                values=values_map
            )
            .execute()
        )

    update_watermark(table_name, max_bronze_ts)

    logger.info(f"================ FIM SILVER (SCD2): {table_name} ================")


def main(tables: Optional[List[str]] = None) -> None:
    """
    Run the Silver (SCD2) load for the given tables, one after another.

    Args:
        tables: Table names to process; None means every key of TABLE_CONFIGS.
    """
    logger.info("Iniciando o processamento da camada Silver (SCD2)...")

    init_schema(SILVER_SCHEMA)
    init_meta_silver_watermark()

    selected = list(TABLE_CONFIGS) if tables is None else tables
    for current_table in selected:
        process_table(current_table)


# When executed as a script, process every table configured in TABLE_CONFIGS.
if __name__ == "__main__":
    main()
