In [0]:
from pyspark.sql import Window
from pyspark.sql import functions as F
from pyspark.sql.types import *

# Capture one timestamp on the driver so every row stamped with it during this
# run (see dt_validacao further down) carries exactly the same value.
processing_timestamp = (
    spark.sql("SELECT current_timestamp() as ts")
    .first()["ts"]
)

In [0]:
# Scores accepted for a satisfaction survey answer: the integers 1 through 5.
VALID_NOTES = list(range(1, 6))

In [0]:
# Fully qualified table names used by this notebook:
#   - the bronze table that is read,
#   - the silver table that receives rows via MERGE,
#   - the silver table that receives rejected rows via append.
(
    TABLE_SOURCE,
    TABLE_TARGET_VALIDOS,
    TABLE_TARGET_INVALIDOS,
) = (
    "v_credit.bronze.pesquisa_satisfacao",
    "v_credit.silver.tb_pesquisa",
    "v_credit.silver.tb_pesquisa_invalidos",
)

In [0]:
# Load the bronze source table as a DataFrame.
df_bronze = spark.read.table(TABLE_SOURCE)

# Row count of the raw input (this triggers a Spark job).
total_bronze = df_bronze.count()

In [0]:
# Standardise the bronze columns (rename + cast) and keep one row per survey id.
#
# dropDuplicates(["cd_pesquisa"]) keeps an arbitrary row per key, so which
# version of a survey reaches the MERGE could change between runs. Instead we
# rank the rows of each cd_pesquisa by ingestion time and keep the most recent
# one. Rows that tie on dt_ingestion are still picked arbitrarily among
# themselves.
janela_mais_recente = (
    Window
    .partitionBy("cd_pesquisa")
    .orderBy(F.col("dt_ingestion").desc_nulls_last())
)

df_limpo = (
    df_bronze
    .select(
        F.col("id_pesquisa").cast("bigint").alias("cd_pesquisa"),
        F.col("id_chamado").cast("bigint").alias("cd_chamado"),
        # try_cast yields NULL instead of failing when the raw value cannot be
        # converted to smallint.
        F.expr("try_cast(nota_atendimento as smallint)").alias("nu_nota"),
        F.col("ingestion_timestamp").alias("dt_ingestion"),
        # Default the origin when the source did not provide one.
        F.coalesce(F.col("origem"), F.lit("sistema_pesquisa")).alias("dc_origem")
    )
    .withColumn("_rn", F.row_number().over(janela_mais_recente))
    .filter(F.col("_rn") == 1)
    .drop("_rn")
    .withColumn("data_processamento", F.current_timestamp())
)

# Row count after deduplication (this triggers a Spark job).
total_limpo = df_limpo.count()

In [0]:
# Attach one boolean flag per quality rule plus an overall "OK"/"ERRO" verdict.
#
# Rules:
#   - flag_pk_valida   : cd_pesquisa is present
#   - flag_fk_valida   : cd_chamado is present
#   - flag_nota_valida : nu_nota is either missing or one of VALID_NOTES
#
# The score rule is driven by VALID_NOTES rather than a hard-coded 1..5 range,
# so the accepted values are defined in a single place.
# NOTE(review): nu_nota is produced with try_cast upstream, so a raw score that
# cannot be parsed arrives here as NULL and is accepted — confirm that this is
# the intended treatment.
df_validacao = (
    df_limpo
    .withColumn(
        "flag_pk_valida",
        F.col("cd_pesquisa").isNotNull()
    )
    .withColumn(
        "flag_fk_valida",
        F.col("cd_chamado").isNotNull()
    )
    .withColumn(
        "flag_nota_valida",
        # isNull() is evaluated first in the OR, so a NULL score yields True
        # even though isin() on NULL evaluates to NULL.
        F.col("nu_nota").isNull() | F.col("nu_nota").isin(VALID_NOTES)
    )
    .withColumn(
        "flag_qualidade",
        F.when(
            F.col("flag_pk_valida") &
            F.col("flag_fk_valida") &
            F.col("flag_nota_valida"),
            F.lit("OK")
        ).otherwise(F.lit("ERRO"))
    )
)

In [0]:
# Build the audit DataFrame with every row that failed at least one rule.

# One optional message per rule; F.when without otherwise() yields NULL when
# the rule passed, and concat_ws skips NULLs, so only failed rules show up.
motivos_rejeicao = [
    F.when(~F.col("flag_pk_valida"), F.lit("cd_pesquisa NULL")),
    F.when(~F.col("flag_fk_valida"), F.lit("cd_chamado NULL")),
    F.when(~F.col("flag_nota_valida"), F.lit("nu_nota fora do range 1-5")),
]

# Helper columns that are removed from the audit output.
colunas_auxiliares = [
    "flag_pk_valida",
    "flag_fk_valida",
    "flag_nota_valida",
    "data_processamento",
]

df_invalidos = (
    df_validacao
    .where(F.col("flag_qualidade") == "ERRO")
    .withColumn("motivo_rejeicao", F.concat_ws("; ", *motivos_rejeicao))
    # Stamp every rejected row with the single timestamp captured for this run.
    .withColumn("dt_validacao", F.lit(processing_timestamp))
    .drop(*colunas_auxiliares)
)

In [0]:
# Upsert the rows that passed every quality rule into the silver table.
#
# df_validacao still contains the rows flagged "ERRO"; merging it directly
# would load rejected rows (NULL keys, out-of-range scores) into the silver
# table, and a NULL cd_pesquisa never matches the ON clause, so such rows
# would be inserted again on every run. Only "OK" rows are merged, and only
# the columns the MERGE actually reads are exposed through the temp view.
df_validos = (
    df_validacao
    .filter(F.col("flag_qualidade") == "OK")
    .select(
        "cd_pesquisa",
        "cd_chamado",
        "nu_nota",
        "dt_ingestion",
        "dc_origem",
    )
)

total_validos = df_validos.count()

if total_validos > 0:
    df_validos.createOrReplaceTempView("temp_pesquisa_validos")
    spark.sql(f"""
        MERGE INTO {TABLE_TARGET_VALIDOS} AS target
        USING temp_pesquisa_validos AS source
        ON target.cd_pesquisa = source.cd_pesquisa
        
        WHEN MATCHED THEN
            UPDATE SET
                target.cd_chamado = source.cd_chamado,
                target.nu_nota = source.nu_nota,
                target.dt_ingestion = source.dt_ingestion,
                target.dc_origem = source.dc_origem
        
        WHEN NOT MATCHED THEN
            INSERT (
                cd_pesquisa,
                cd_chamado,
                nu_nota,
                dt_ingestion,
                dc_origem
            )
            VALUES (
                source.cd_pesquisa,
                source.cd_chamado,
                source.nu_nota,
                source.dt_ingestion,
                source.dc_origem
            )
    """)
    
    # Metrics of the most recent MERGE recorded in the table history.
    # This is a lazy DataFrame: nothing is queried until an action
    # (show/collect/display) is called on merge_stats. It is only defined
    # when there was at least one valid row to merge.
    merge_stats = spark.sql(f"""
        SELECT 
            version,
            timestamp,
            operationMetrics.numTargetRowsInserted as inseridos,
            operationMetrics.numTargetRowsUpdated as atualizados,
            operationMetrics.numOutputRows as total_afetados
        FROM (DESCRIBE HISTORY {TABLE_TARGET_VALIDOS})
        WHERE operation = 'MERGE'
        ORDER BY version DESC
        LIMIT 1
    """)

In [0]:
# Append the rejected rows to the audit table, or report that there were none.
total_invalidos = df_invalidos.count()

if total_invalidos <= 0:
    print("Nenhum registro inválido - 100% de qualidade!")
else:
    print(f"Enviando {total_invalidos} registro(s) inválido(s) para auditoria...")

    # Append mode: rows written by earlier runs are kept.
    escritor_invalidos = df_invalidos.write.format("delta").mode("append")
    escritor_invalidos.saveAsTable(TABLE_TARGET_INVALIDOS)