# In [None]:
from datetime import datetime, timezone, timedelta
from typing import Dict, List, Optional, Tuple

import logging

import pyspark.sql.functions as F
from pyspark.sql import DataFrame
from delta.tables import DeltaTable
from pyspark.sql.types import StructType, StructField, TimestampType, StringType

# Catalog prefix for table names. When empty, qname() and init_schema() emit
# `schema.table` / bare schema names instead of catalog-qualified ones.
CATALOG = ""

# Components of the landing volume path (combined into LANDING_CSV_BASE_PATH).
VOLUME_CATALOG = "main"
VOLUME_SCHEMA = "engenharia_dados"
VOLUME_NAME = "aviacao_landing"

# Root folder of the landing CSV files; each table is read from
# `<LANDING_CSV_BASE_PATH>/<table_name>`.
LANDING_CSV_BASE_PATH = f"/Volumes/{VOLUME_CATALOG}/{VOLUME_SCHEMA}/{VOLUME_NAME}/aviacao/landing"

# Schema that holds the Bronze `<table>_changelog` tables.
BRONZE_SCHEMA = "aviacao_bronze"
# Constant written to the `origem_sistema` metadata column of every Bronze row.
ORIGEM_SISTEMA = "postgres-aviacao"

# Seconds subtracted from the latest Bronze `data_ref` to build the incremental
# cutoff, so rows slightly older than the high-water mark are read again.
LATE_ARRIVAL_LOOKBACK_SECONDS = 300
# When True, tables without an entry in TABLE_SCHEMAS are read with inferSchema;
# when False, a missing schema makes the landing read raise ValueError.
USE_INFER_SCHEMA = True

# Per-table ingestion settings keyed by source table name. The key order is the
# default processing order when main() is called without an explicit list.
#   "schema"       - source schema name (not read by the code in this section)
#   "business_key" - columns that, together with data_ref, identify one
#                    change-log row for deduplication and MERGE matching
# Every table gets its own dict and list so entries can be edited independently.
TABLE_CONFIGS: Dict[str, Dict] = {
    source_table: {"schema": "aviacao", "business_key": ["id"]}
    for source_table in (
        "companhias_aereas",
        "modelos_avioes",
        "aeroportos",
        "aeronaves",
        "funcionarios",
        "clientes",
        "voos",
        "reservas",
        "bilhetes",
        "bagagens",
        "manutencoes",
        "tripulacao_voo",
    )
}

# Optional explicit landing schemas keyed by table name. A table listed here is
# read with that schema; otherwise the read falls back to USE_INFER_SCHEMA.
TABLE_SCHEMAS: Dict[str, StructType] = {
    # to avoid inferSchema, define the landing schema of each table here
    # "companhias_aereas": StructType([...]),
}

# Module logger. The stream handler is attached only when the logger has none,
# so re-running this code in the same interpreter does not duplicate output.
logger = logging.getLogger("aviacao_bronze")
logger.setLevel(logging.INFO)
if not logger.handlers:
    handler = logging.StreamHandler()
    handler.setFormatter(
        logging.Formatter("%(asctime)s [%(levelname)s] %(name)s - %(message)s")
    )
    logger.addHandler(handler)


def qname(schema: str, table: str) -> str:
    """Build the qualified name of *table* inside *schema*.

    The result is ``CATALOG.schema.table`` when ``CATALOG`` is non-empty and
    ``schema.table`` otherwise.
    """
    catalog_prefix = f"{CATALOG}." if CATALOG else ""
    return f"{catalog_prefix}{schema}.{table}"


def now_utc() -> datetime:
    """Return the current instant as a timezone-aware ``datetime`` in UTC."""
    return datetime.now(tz=timezone.utc)


def init_schema(schema_name: str) -> None:
    """Create *schema_name* if it does not exist yet.

    The schema is qualified with ``CATALOG`` only when that constant is
    non-empty. The statement runs through the ambient ``spark`` session.
    """
    target = schema_name
    if CATALOG:
        target = f"{CATALOG}.{schema_name}"
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {target}")


def bronze_table_name(table_name: str) -> str:
    """Return the qualified name of the Bronze change-log table for *table_name*.

    The table lives in ``BRONZE_SCHEMA`` and is named ``<table_name>_changelog``.
    """
    changelog_table = f"{table_name}_changelog"
    return qname(BRONZE_SCHEMA, changelog_table)


def get_landing_schema(table_name: str) -> Optional[StructType]:
    """Return the explicit landing schema registered for *table_name*.

    Returns ``None`` when ``TABLE_SCHEMAS`` has no entry for the table.
    """
    try:
        return TABLE_SCHEMAS[table_name]
    except KeyError:
        return None


def path_exists(path: str) -> bool:
    """Return True when *path* can be listed and contains at least one entry.

    Any failure to list the path yields ``False`` (best-effort check), so the
    caller treats the table as having no landing files. Failures that do not
    look like a missing path are logged as warnings, so that permission or
    connectivity problems are not silently reported as "no files".
    """
    try:
        entries = dbutils.fs.ls(path)
    except Exception as exc:
        # NOTE(review): dbutils is expected to surface a missing path as a
        # wrapped java.io.FileNotFoundException, so the exception text is the
        # only signal available here - confirm on the target runtime.
        if "FileNotFoundException" not in str(exc):
            logger.warning(
                f"Falha ao listar {path}; tratando como inexistente: {exc}"
            )
        return False
    return len(entries) > 0


def _incremental_cutoff(table_name: str) -> Optional[datetime]:
    """Return the ``data_ref`` cutoff for an incremental read of *table_name*.

    The cutoff is the largest ``data_ref`` stored in the Bronze change-log
    minus ``LATE_ARRIVAL_LOOKBACK_SECONDS``. ``None`` means no cutoff applies
    (the Bronze table is missing or has no ``data_ref`` value), so the whole
    landing folder is eligible.
    """
    bronze_table = bronze_table_name(table_name)

    if not spark.catalog.tableExists(bronze_table):
        logger.info(f"[{table_name}] Nenhuma tabela Bronze ainda; carga full da landing.")
        return None

    last_data_ref = (
        spark.table(bronze_table)
        .agg(F.max("data_ref").alias("max_dr"))
        .collect()[0]["max_dr"]
    )
    if last_data_ref is None:
        # The table exists but max(data_ref) is NULL: nothing to compare against.
        return None

    margin_ts = last_data_ref - timedelta(seconds=LATE_ARRIVAL_LOOKBACK_SECONDS)
    logger.info(
        f"[{table_name}] Último data_ref no Bronze: {last_data_ref} "
        f"(lookback aplicado: {margin_ts})"
    )
    return margin_ts


def _landing_csv_reader(table_name: str):
    """Return a ``spark.read`` reader configured for *table_name*'s landing CSVs.

    Files are read with a header row and ``;`` as delimiter. An explicit schema
    from ``get_landing_schema`` takes precedence; otherwise ``inferSchema`` is
    enabled when ``USE_INFER_SCHEMA`` allows it.

    Raises:
        ValueError: no explicit schema is configured and ``USE_INFER_SCHEMA``
            is False.
    """
    reader = spark.read.option("header", "true").option("delimiter", ";")

    schema = get_landing_schema(table_name)
    if schema is not None:
        return reader.schema(schema)

    if not USE_INFER_SCHEMA:
        raise ValueError(
            f"Schema fixo não configurado para {table_name} e USE_INFER_SCHEMA=False."
        )

    logger.warning(
        f"[{table_name}] Usando inferSchema na landing. "
        f"Configure TABLE_SCHEMAS para produção."
    )
    return reader.option("inferSchema", "true")


def read_landing_incremental(table_name: str) -> Tuple[Optional[DataFrame], int]:
    """Read the landing rows of *table_name* that still have to reach Bronze.

    Steps: compute the incremental cutoff from the Bronze change-log, read the
    landing CSV folder, cast ``data_ref`` to timestamp, drop rows whose
    ``data_ref`` is NULL, keep only rows newer than the cutoff (when one
    applies) and deduplicate on business key + ``data_ref``.

    Returns:
        ``(dataframe, row_count)`` for the rows to load, or ``(None, 0)`` when
        the landing folder is missing/empty or nothing survives the filters.

    Raises:
        ValueError: the landing data has no ``data_ref`` column, or no schema
            is available and schema inference is disabled.
    """
    margin_ts = _incremental_cutoff(table_name)

    landing_path = f"{LANDING_CSV_BASE_PATH}/{table_name}"
    if not path_exists(landing_path):
        logger.info(f"[{table_name}] Nenhum arquivo encontrado na landing em {landing_path}.")
        return None, 0

    df = _landing_csv_reader(table_name).csv(landing_path)

    if "data_ref" not in df.columns:
        raise ValueError(f"[{table_name}] Coluna data_ref não encontrada na landing.")

    df = df.withColumn("data_ref", F.col("data_ref").cast("timestamp"))

    # One pass over the files yields both numbers: count(lit(1)) counts every
    # row, count("data_ref") counts only rows whose data_ref is not NULL.
    counts = df.agg(
        F.count(F.lit(1)).alias("total_raw"),
        F.count("data_ref").alias("total_valid"),
    ).collect()[0]
    total_raw = counts["total_raw"]
    null_count = total_raw - counts["total_valid"]

    logger.info(f"[{table_name}] Registros brutos na landing (antes de filtros): {total_raw}")
    if null_count > 0:
        logger.warning(
            f"[{table_name}] {null_count} registros descartados por data_ref nula."
        )

    df_valid = df.filter(F.col("data_ref").isNotNull())

    if margin_ts is not None:
        df_valid = df_valid.filter(F.col("data_ref") > F.lit(margin_ts))

    if "change_op" in df_valid.columns:
        # Purely informational: distribution of change operations after filtering.
        for row in df_valid.groupBy("change_op").count().collect():
            logger.info(
                f"[{table_name}] (landing filtrada) change_op={row['change_op']} "
                f"-> {row['count']} registros"
            )

    business_key_cols = TABLE_CONFIGS[table_name]["business_key"]
    df_valid = df_valid.dropDuplicates(business_key_cols + ["data_ref"])
    total_final = df_valid.count()
    logger.info(
        f"[{table_name}] Registros após filtros + deduplicação (landing incremental): "
        f"{total_final}"
    )

    if total_final == 0:
        return None, 0

    return df_valid, total_final


def ensure_bronze_changelog_table(table_name: str, df_sample: DataFrame) -> None:
    """Create the empty Bronze change-log table for *table_name* if it is missing.

    The table schema is the schema of *df_sample* followed by the non-nullable
    metadata columns ``bronze_load_ts``, ``bronze_batch_id`` and
    ``origem_sistema``; a metadata column already present in *df_sample* is not
    added again. Nothing happens when the table already exists.
    """
    bronze_table = bronze_table_name(table_name)

    if spark.catalog.tableExists(bronze_table):
        return

    logger.info(f"[{table_name}] Criando tabela Bronze change-log vazia: {bronze_table}")

    source_fields = list(df_sample.schema.fields)
    present = {field.name for field in source_fields}

    metadata_columns = (
        ("bronze_load_ts", TimestampType()),
        ("bronze_batch_id", StringType()),
        ("origem_sistema", StringType()),
    )
    metadata_fields = [
        StructField(name, dtype, nullable=False)
        for name, dtype in metadata_columns
        if name not in present
    ]

    bronze_schema = StructType(source_fields + metadata_fields)

    # "ignore" instead of "overwrite": if the table appears between the
    # existence check above and this write, its rows must not be wiped.
    (
        spark.createDataFrame([], bronze_schema)
        .write
        .format("delta")
        .mode("ignore")
        .saveAsTable(bronze_table)
    )

    logger.info(f"[{table_name}] Tabela Bronze criada como append-only change-log.")


def merge_into_bronze_changelog(
    df: DataFrame,
    table_name: str,
    business_key_cols: List[str],
    batch_id: str,
) -> int:
    """Insert the rows of *df* that are not yet in the Bronze change-log.

    The Bronze table is created first when missing. *df* is enriched with the
    metadata columns ``bronze_load_ts`` (current UTC time), ``bronze_batch_id``
    (*batch_id*) and ``origem_sistema``, deduplicated on business key +
    ``data_ref`` and merged insert-only: rows whose key and ``data_ref`` already
    exist in the target are left untouched.

    Returns:
        The number of deduplicated source rows submitted to the MERGE (0 when
        there is nothing to merge). This is counted before the MERGE runs, so
        it can exceed the number of rows actually inserted.
    """
    bronze_table = bronze_table_name(table_name)

    ensure_bronze_changelog_table(table_name, df)

    df_enriched = (
        df
        .withColumn("bronze_load_ts", F.lit(now_utc()))
        .withColumn("bronze_batch_id", F.lit(batch_id))
        .withColumn("origem_sistema", F.lit(ORIGEM_SISTEMA).cast("string"))
    )

    match_cols = business_key_cols + ["data_ref"]
    df_enriched = df_enriched.dropDuplicates(match_cols)

    total_to_merge = df_enriched.count()
    logger.info(f"[{table_name}] Registros a serem mesclados no Bronze: {total_to_merge}")

    if total_to_merge == 0:
        return 0

    if "change_op" in df_enriched.columns:
        # Purely informational: distribution of change operations in this batch.
        for row in df_enriched.groupBy("change_op").count().collect():
            logger.info(
                f"[{table_name}] (bronze) change_op={row['change_op']} "
                f"-> {row['count']} registros"
            )

    # Null-safe equality (<=>) keeps the MERGE consistent with dropDuplicates,
    # which treats NULLs as equal. With plain "=", a row holding a NULL key
    # never matches and would be inserted again on every run.
    merge_condition = " AND ".join(
        f"tgt.{key} <=> src.{key}" for key in match_cols
    )

    (
        DeltaTable.forName(spark, bronze_table)
        .alias("tgt")
        .merge(df_enriched.alias("src"), merge_condition)
        .whenNotMatchedInsertAll()
        .execute()
    )

    logger.info(f"[{table_name}] MERGE em Bronze change-log concluído.")
    return total_to_merge


def process_table(table_name: str) -> None:
    """Run the landing -> Bronze load for a single configured table.

    Reads the incremental landing rows and, when there are any, merges them
    into the Bronze change-log under a fresh batch id, logging a summary.

    Raises:
        ValueError: *table_name* has no entry in ``TABLE_CONFIGS``.
    """
    if table_name not in TABLE_CONFIGS:
        raise ValueError(f"Tabela '{table_name}' não está configurada em TABLE_CONFIGS.")

    business_key_cols = TABLE_CONFIGS[table_name]["business_key"]

    logger.info(f"================ INÍCIO BRONZE: {table_name} ================")

    df_src, total_inc = read_landing_incremental(table_name)

    if df_src is None or total_inc == 0:
        logger.info(f"[{table_name}] Nenhum dado incremental para processar.")
        logger.info(f"================ FIM BRONZE (sem dados): {table_name} ================")
        return

    # Same UTC clock as the bronze_load_ts column, so batch ids do not depend
    # on the driver's local timezone or on DST shifts.
    batch_id = now_utc().strftime("%Y%m%d%H%M%S")

    merged = merge_into_bronze_changelog(df_src, table_name, business_key_cols, batch_id)

    logger.info(
        f"[{table_name}] Resumo Bronze: lidos_incremental={total_inc}, "
        f"mesclados_no_bronze={merged}, batch_id={batch_id}"
    )
    logger.info(f"================ FIM BRONZE: {table_name} ================")


def main(tables: Optional[List[str]] = None) -> None:
    """Load the given tables into Bronze, one after another.

    The Bronze schema is created first if needed. When *tables* is ``None``
    every table in ``TABLE_CONFIGS`` is processed, in configuration order.
    """
    init_schema(BRONZE_SCHEMA)

    selected = list(TABLE_CONFIGS) if tables is None else tables
    for table_name in selected:
        process_table(table_name)


# Entry point: when executed directly, load every table in TABLE_CONFIGS.
if __name__ == "__main__":
    main()
