In [0]:
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, trim, upper, lower, initcap, current_timestamp,
    lit, coalesce, when, regexp_replace, length, row_number, to_timestamp, levenshtein, unix_timestamp, round
)
from pyspark.sql.types import IntegerType, StringType, TimestampType
from pyspark.sql.window import Window
from pyspark.sql.utils import AnalysisException
import pyspark.sql.functions as F

# Catalog name and the bronze/silver schema names used to build every table path.
catalog_name, bronze_db_name, silver_db_name = "atendimento_catalog", "bronze", "silver"

# Point the session at the catalog and at the silver schema.
spark.sql("USE CATALOG {}".format(catalog_name))
spark.sql("USE SCHEMA {}".format(silver_db_name))

In [0]:
# Catalog and schema names used to build fully qualified table names.
catalog_name, bronze_db_name, silver_db_name = (
    "atendimento_catalog",
    "bronze",
    "silver",
)

In [0]:
# Make the catalog and the silver schema the session defaults.
spark.sql("USE CATALOG " + catalog_name)
spark.sql("USE SCHEMA " + silver_db_name)

In [0]:
def safe_table_exists(spark, full_name: str) -> bool:
    """Return True when ``spark.table(full_name)`` succeeds, False when it raises.

    Args:
        spark: Session object exposing ``table(name)``.
        full_name: Table identifier passed verbatim to ``spark.table``.

    Returns:
        True if the lookup did not raise, otherwise False.

    Only ``Exception`` subclasses are treated as "table not available";
    ``KeyboardInterrupt`` and ``SystemExit`` propagate so an interrupted
    notebook run is not misreported as a missing table.
    """
    try:
        spark.table(full_name)
    except Exception:
        return False
    return True

def safe_col(df, name):
    """Reference column ``name`` of ``df``, or a NULL string literal when it is absent.

    The membership test against ``df.columns`` is an exact (case-sensitive)
    string comparison.
    """
    if name not in df.columns:
        return lit(None).cast(StringType())
    return col(name)

def safe_cast_int(col_expr):
    """Build an integer column from the digits found in ``col_expr``.

    NULL or blank (after trim) inputs yield NULL. Otherwise every non-digit
    character is removed and the remainder is cast to integer.
    """
    # NOTE(review): signs and decimal points are stripped too, so "-5" becomes 5
    # and "1.0" becomes 10 — confirm this is intended for the bronze data.
    as_text = trim(col_expr).cast(StringType())
    has_value = col_expr.isNotNull() & (as_text != "")
    digits_only = F.regexp_replace(as_text, r'[^\d]', '')
    return when(has_value, digits_only.cast(IntegerType())).otherwise(None)

def remove_accents_udf(text_col):
    """Replace accented Latin letters in ``text_col`` with unaccented ones.

    Despite the name this is not a Python UDF: it returns an ``F.translate``
    column expression that maps each character of ``accented`` to the
    character at the same position in ``plain``.
    """
    accented = "áàãâäéèêëíìîïóòõôöúùûüçñÁÀÃÂÄÉÈÊËÍÌÎÏÓÒÕÔÖÚÙÛÜÇÑ"
    plain = "aaaaaeeeeiiiiooooouuuucnAAAAAEEEEIIIIOOOOOUUUUCN"
    return F.translate(text_col, accented, plain)

def normalize_text(col_expr):
    """Collapse whitespace, drop characters outside the allowed ranges, and trim.

    Steps, in order: every whitespace run becomes one space; characters outside
    printable ASCII (0x20-0x7E) and U+00C0-U+00FF are removed; the result is trimmed.
    """
    single_spaced = regexp_replace(col_expr, r'\s+', ' ')
    allowed_only = regexp_replace(single_spaced, r'[^\x20-\x7E\u00C0-\u00FF]', '')
    return trim(allowed_only)

def parse_to_brasilia_timezone(col_expr):
    """Clean a date-time string column and parse it as 'dd/MM/yyyy HH:mm:ss'.

    Args:
        col_expr: Column holding the raw date-time text.

    Returns:
        Column produced by ``to_timestamp`` on the cleaned text, passed through
        ``to_utc_timestamp`` and then ``from_utc_timestamp``, both with
        'America/Sao_Paulo'.
    """
    # Replace each character matched by the class with a plain space.
    # NOTE(review): as written the class holds a single space, which makes this a
    # no-op; it may originally have targeted a non-breaking space — confirm.
    cleaned = regexp_replace(col_expr, r'[ ]', ' ')
    # Replace the standalone word "às"/"as" with a space (presumably the
    # separator between date and time — verify against the bronze data).
    cleaned = regexp_replace(cleaned, r'\bàs\b|\bas\b|\bàs\b', ' ')
    # Collapse any whitespace run left behind into one space.
    cleaned = regexp_replace(cleaned, r'\s+', ' ')
    cleaned = trim(cleaned)
    # Text that does not match the pattern is expected to parse to NULL — TODO confirm for the cluster's parser settings.
    parsed_naive = to_timestamp(cleaned, 'dd/MM/yyyy HH:mm:ss')
    # NOTE(review): both conversions use the same zone, so they appear to cancel
    # each other out — confirm whether a real timezone shift was intended.
    parsed_utc = F.to_utc_timestamp(parsed_naive, 'America/Sao_Paulo')
    return F.from_utc_timestamp(parsed_utc, 'America/Sao_Paulo')

## Processamento: ft_atendentes

In [0]:
# Bronze input and silver output for the attendants table.
src_table = ".".join([catalog_name, bronze_db_name, "ft_atendentes"])
tgt_table = ".".join([catalog_name, silver_db_name, "ft_atendentes"])

# Fail fast when the bronze table cannot be read.
if safe_table_exists(spark, src_table):
    df_src = spark.table(src_table)
else:
    raise RuntimeError(f"Tabela não encontrada: {src_table}")

# Row count before cleaning, printed for comparison with the silver count.
total_before = df_src.count()
print("Bronze: {:,} registros".format(total_before))

In [0]:
# Clean the three business columns; unusable values become NULL.
df = (
    df_src
    .withColumn("id_atendente", safe_cast_int(safe_col(df_src, "id_atendente")))
    .withColumn(
        "nome_atendente",
        initcap(remove_accents_udf(normalize_text(safe_col(df_src, "nome_atendente")))),
    )
    .withColumn(
        "nivel_atendimento",
        # Only levels 1 and 2 are kept; anything else becomes NULL.
        when(
            safe_cast_int(safe_col(df_src, "nivel_atendimento")).isin([1, 2]),
            safe_cast_int(safe_col(df_src, "nivel_atendimento")),
        ).otherwise(None),
    )
)

# Keep rows with a positive id, a non-blank name and a valid level.
df_valid = df.filter(
    col("id_atendente").isNotNull()
    & (col("id_atendente") > 0)
    & col("nome_atendente").isNotNull()
    & (trim(col("nome_atendente")) != "")
    & col("nivel_atendimento").isNotNull()
)

# One row per id_atendente: newest ingestion first when that column exists,
# otherwise a constant ordering (the surviving row is then arbitrary).
if "ingestion_timestamp" in df_valid.columns:
    w = Window.partitionBy("id_atendente").orderBy(col("ingestion_timestamp").desc_nulls_last())
else:
    w = Window.partitionBy("id_atendente").orderBy(lit(datetime.now()))

df_dedup = (
    df_valid
    .withColumn("rn", row_number().over(w))
    .filter(col("rn") == 1)
    .drop("rn")
)

df_final = df_dedup.withColumn("processed_timestamp", current_timestamp())

# Project to the silver layout, skipping columns that are not present.
cols = [
    "id_atendente",
    "nome_atendente",
    "nivel_atendimento",
    "processed_timestamp",
    "ingestion_timestamp",
]
df_final = df_final.select(*[c for c in cols if c in df_final.columns])

# Enforce the final types; ingestion_timestamp is added as NULL when missing.
df_typed = (
    df_final
    .withColumn("id_atendente", col("id_atendente").cast(IntegerType()))
    .withColumn("nome_atendente", col("nome_atendente").cast(StringType()))
    .withColumn("nivel_atendimento", col("nivel_atendimento").cast(IntegerType()))
    .withColumn("processed_timestamp", col("processed_timestamp").cast(TimestampType()))
    .withColumn(
        "ingestion_timestamp",
        col("ingestion_timestamp").cast(TimestampType())
        if "ingestion_timestamp" in df_final.columns
        else lit(None).cast(TimestampType()),
    )
)

print("Silver: {:,} registros".format(df_typed.count()))

In [0]:
# Replace the silver table (schema included) with this run's result.
(
    df_typed.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(tgt_table)
)
print("Salvo: {}".format(tgt_table))
# Preview the first rows of what was just written.
display(spark.table(tgt_table).limit(10))

## Processamento: ft_chamados_hora

In [0]:
# Bronze input and silver output for the ticket timestamps table.
src_table = ".".join([catalog_name, bronze_db_name, "ft_chamados_hora"])
tgt_table = ".".join([catalog_name, silver_db_name, "ft_chamados_hora"])

# Fail fast when the bronze table cannot be read.
if safe_table_exists(spark, src_table):
    df_src = spark.table(src_table)
else:
    raise RuntimeError(f"Tabela não encontrada: {src_table}")

# Row count before cleaning, printed for comparison with the silver count.
total_before = df_src.count()
print("Bronze: {:,} registros".format(total_before))

In [0]:
# Build snake_case columns from the bronze ones and parse the three timestamps.
df = (
    df_src
    .withColumn("id_chamado", safe_cast_int(safe_col(df_src, "ID_Chamado")))
    .withColumn("id_cliente", trim(safe_col(df_src, "ID_Cliente")))
    .withColumn(
        "hora_abertura_chamado_brasilia",
        parse_to_brasilia_timezone(safe_col(df_src, "Hora_Abertura_Chamado")),
    )
    .withColumn(
        "hora_inicio_atendimento_brasilia",
        parse_to_brasilia_timezone(safe_col(df_src, "Hora_Inicio_Atendimento")),
    )
    .withColumn(
        "hora_finalizacao_atendimento_brasilia",
        parse_to_brasilia_timezone(safe_col(df_src, "Hora_Finalizacao_Atendimento")),
    )
)

# Keep rows with valid keys, all three timestamps present, and
# opening <= start <= end.
df_valid = df.filter(
    col("id_chamado").isNotNull()
    & (col("id_chamado") > 0)
    & col("id_cliente").isNotNull()
    & (length(col("id_cliente")) > 0)
    & col("hora_abertura_chamado_brasilia").isNotNull()
    & col("hora_inicio_atendimento_brasilia").isNotNull()
    & col("hora_finalizacao_atendimento_brasilia").isNotNull()
    & (col("hora_abertura_chamado_brasilia") <= col("hora_inicio_atendimento_brasilia"))
    & (col("hora_inicio_atendimento_brasilia") <= col("hora_finalizacao_atendimento_brasilia"))
)

# One row per id_chamado: newest ingestion first when that column exists,
# otherwise a constant ordering (the surviving row is then arbitrary).
if "ingestion_timestamp" in df_valid.columns:
    w = Window.partitionBy("id_chamado").orderBy(col("ingestion_timestamp").desc_nulls_last())
else:
    w = Window.partitionBy("id_chamado").orderBy(lit(datetime.now()))

df_dedup = (
    df_valid
    .withColumn("rn", row_number().over(w))
    .filter(col("rn") == 1)
    .drop("rn")
)

df_final = df_dedup.withColumn("processed_timestamp", current_timestamp())

# Project to the silver layout, skipping columns that are not present.
cols = [
    "id_chamado",
    "id_cliente",
    "hora_abertura_chamado_brasilia",
    "hora_inicio_atendimento_brasilia",
    "hora_finalizacao_atendimento_brasilia",
    "processed_timestamp",
    "ingestion_timestamp",
]
df_final = df_final.select(*[c for c in cols if c in df_final.columns])

# Enforce the final types; ingestion_timestamp is added as NULL when missing.
df_typed = (
    df_final
    .withColumn("id_chamado", col("id_chamado").cast(IntegerType()))
    .withColumn("id_cliente", col("id_cliente").cast(StringType()))
    .withColumn("hora_abertura_chamado_brasilia", col("hora_abertura_chamado_brasilia").cast(TimestampType()))
    .withColumn("hora_inicio_atendimento_brasilia", col("hora_inicio_atendimento_brasilia").cast(TimestampType()))
    .withColumn("hora_finalizacao_atendimento_brasilia", col("hora_finalizacao_atendimento_brasilia").cast(TimestampType()))
    .withColumn("processed_timestamp", col("processed_timestamp").cast(TimestampType()))
    .withColumn(
        "ingestion_timestamp",
        col("ingestion_timestamp").cast(TimestampType())
        if "ingestion_timestamp" in df_final.columns
        else lit(None).cast(TimestampType()),
    )
)

print("Silver: {:,} registros".format(df_typed.count()))

In [0]:
# Replace the silver table (schema included) with this run's result.
(
    df_typed.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(tgt_table)
)
# Kept in a variable because the ft_chamados cell joins against it.
silver_chamados_hora = spark.table(tgt_table)
print("Salvo: {}".format(tgt_table))
display(silver_chamados_hora.limit(10))

## Processamento: dm_motivos

In [0]:
# Bronze ft_motivos feeds the silver dm_motivos dimension.
src_table = ".".join([catalog_name, bronze_db_name, "ft_motivos"])
tgt_table = ".".join([catalog_name, silver_db_name, "dm_motivos"])

# Fail fast when the bronze table cannot be read.
if safe_table_exists(spark, src_table):
    df_src = spark.table(src_table)
else:
    raise RuntimeError(f"Tabela não encontrada: {src_table}")

# Row count before cleaning, printed for comparison with the silver count.
total_before = df_src.count()
print("Bronze: {:,} registros".format(total_before))

In [0]:
def _categoria_from_motivo(text_col):
    """Build the category expression for an accent-free, lower-cased reason name.

    Each rule is a label plus keywords; a row gets the label of the first rule
    with a keyword contained in ``text_col`` and "Desconhecida" when no rule
    matches (including when ``text_col`` is NULL).
    """
    # Order matters: earlier rules win when several keywords match.
    rules = [
        ("Financeiro", ["fatura", "limite", "contrato", "pagamento", "divida", "renegocia"]),
        ("Cartão", ["cartao", "bloqueio", "desbloqueio", "compra", "adicional"]),
        ("Cadastral", ["dados", "cadastra", "telefone", "email", "agencia", "conta", "endereco"]),
        ("Atendimento", ["app", "aplicativo", "site", "chatbot", "ura", "problema", "erro"]),
        ("Benefícios", ["ponto", "beneficio", "programa"]),
    ]
    expr = None
    for label, keywords in rules:
        cond = None
        for kw in keywords:
            hit = text_col.like(f"%{kw}%")
            cond = hit if cond is None else (cond | hit)
        expr = when(cond, lit(label)) if expr is None else expr.when(cond, lit(label))
    return expr.otherwise(lit("Desconhecida"))


# id_motivo is cleaned to an integer; nome_motivo is kept as delivered.
df = (
    df_src
    .withColumn("id_motivo", safe_cast_int(safe_col(df_src, "id_motivo")))
    .withColumn("nome_motivo", safe_col(df_src, "nome_motivo"))
)

# The keywords are written without accents, so they are matched against an
# accent-free copy of the name. Matching the raw name (as before) sent accented
# values such as "Cartão" or "Dívida" to "Desconhecida".
df = df.withColumn(
    "categoria",
    _categoria_from_motivo(lower(remove_accents_udf(col("nome_motivo")))),
)

# Normalise criticidade, then restore the accent on the one value that needs it.
criticidade_norm = initcap(remove_accents_udf(normalize_text(safe_col(df_src, "criticidade"))))
df = df.withColumn(
    "criticidade",
    when(criticidade_norm == lit("Media"), lit("Média")).otherwise(criticidade_norm),
)

# Keep rows with a positive id and a non-blank name.
df_valid = df.filter(
    col("id_motivo").isNotNull()
    & (col("id_motivo") > 0)
    & col("nome_motivo").isNotNull()
    & (trim(col("nome_motivo")) != "")
)

# One row per id_motivo: newest ingestion first when that column exists,
# otherwise a constant ordering (the surviving row is then arbitrary).
w = Window.partitionBy("id_motivo").orderBy(
    col("ingestion_timestamp").desc_nulls_last()
    if "ingestion_timestamp" in df_valid.columns
    else lit(datetime.now())
)

df_dedup = df_valid.withColumn("rn", row_number().over(w)).filter(col("rn") == 1).drop("rn")

df_final = df_dedup.withColumn("processed_timestamp", current_timestamp())

# Project to the silver layout, skipping columns that are not present.
cols = ["id_motivo", "nome_motivo", "categoria", "criticidade", "processed_timestamp", "ingestion_timestamp"]
df_final = df_final.select(*[c for c in cols if c in df_final.columns])

# Enforce the final types; ingestion_timestamp is added as NULL when missing.
df_typed = (
    df_final
    .withColumn("id_motivo", col("id_motivo").cast(IntegerType()))
    .withColumn("nome_motivo", col("nome_motivo").cast(StringType()))
    .withColumn("categoria", col("categoria").cast(StringType()))
    .withColumn("criticidade", col("criticidade").cast(StringType()))
    .withColumn("processed_timestamp", col("processed_timestamp").cast(TimestampType()))
    .withColumn(
        "ingestion_timestamp",
        col("ingestion_timestamp").cast(TimestampType())
        if "ingestion_timestamp" in df_final.columns
        else lit(None).cast(TimestampType()),
    )
)

print(f"Silver: {df_typed.count():,} registros")

In [0]:
# Replace the silver table (schema included) with this run's result.
(
    df_typed.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(tgt_table)
)
print("Salvo: {}".format(tgt_table))
# Preview the first rows of what was just written.
display(spark.table(tgt_table).limit(10))

## Processamento: dm_canais

In [0]:
# Bronze input and silver output for the channels dimension.
src_table = ".".join([catalog_name, bronze_db_name, "dm_canais"])
tgt_table = ".".join([catalog_name, silver_db_name, "dm_canais"])

# Fail fast when the bronze table cannot be read.
if safe_table_exists(spark, src_table):
    df_src = spark.table(src_table)
else:
    raise RuntimeError(f"Tabela não encontrada: {src_table}")

# Row count before cleaning, printed for comparison with the silver count.
total_before = df_src.count()
print("Bronze: {:,} registros".format(total_before))

In [0]:
# Normalise channel name and status: clean text, strip accents, capitalise words.
df = (
    df_src
    .withColumn("nome_canal", initcap(remove_accents_udf(normalize_text(safe_col(df_src, "nome_canal")))))
    .withColumn("canal_status", initcap(remove_accents_udf(normalize_text(safe_col(df_src, "canal_status")))))
)

# NOTE(review): the misspelt status "Invativo" is mapped to "Ativo", not
# "Inativo" — confirm this is the intended business rule.
df = df.withColumn("canal_status", when(col("canal_status") == lit("Invativo"), lit("Ativo")).otherwise(col("canal_status")))
# The "Web" channel is always forced to "Inativo".
df = df.withColumn("canal_status", when(col("nome_canal") == "Web", lit("Inativo")).otherwise(col("canal_status")))

# Keep rows with a non-blank channel name.
df_valid = df.filter((col("nome_canal").isNotNull()) & (trim(col("nome_canal")) != ""))

# One row per nome_canal: newest ingestion first when that column exists,
# otherwise a constant ordering (the surviving row is then arbitrary).
w = Window.partitionBy("nome_canal").orderBy(
    col("ingestion_timestamp").desc_nulls_last()
    if "ingestion_timestamp" in df_valid.columns
    else lit(datetime.now())
)

df_dedup = df_valid.withColumn("rn", row_number().over(w)).filter(col("rn") == 1).drop("rn")

# The surrogate key is assigned only after validation and de-duplication, so
# discarded rows do not consume ids and the sequence has no gaps. nome_canal is
# unique at this point and breaks ties inside a status, which makes the
# numbering repeatable between runs (ordering by canal_status alone left the
# order of equal-status rows undefined).
w_id = Window.orderBy("canal_status", "nome_canal")
df_dedup = df_dedup.withColumn("id_canal", row_number().over(w_id))

df_final = df_dedup.withColumn("processed_timestamp", current_timestamp())

# Project to the silver layout, skipping columns that are not present.
cols = ["id_canal", "nome_canal", "canal_status", "processed_timestamp", "ingestion_timestamp"]
df_final = df_final.select(*[c for c in cols if c in df_final.columns])

# Enforce the final types; ingestion_timestamp is added as NULL when missing.
df_typed = (
    df_final
    .withColumn("id_canal", col("id_canal").cast(IntegerType()))
    .withColumn("nome_canal", col("nome_canal").cast(StringType()))
    .withColumn("canal_status", col("canal_status").cast(StringType()))
    .withColumn("processed_timestamp", col("processed_timestamp").cast(TimestampType()))
    .withColumn(
        "ingestion_timestamp",
        col("ingestion_timestamp").cast(TimestampType())
        if "ingestion_timestamp" in df_final.columns
        else lit(None).cast(TimestampType()),
    )
)

print(f"Silver: {df_typed.count():,} registros")

In [0]:
# Replace the silver table (schema included) with this run's result.
(
    df_typed.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(tgt_table)
)
# Kept in a variable because the ft_chamados cell joins against it.
silver_canais = spark.table(tgt_table)
print("Salvo: {}".format(tgt_table))
display(silver_canais.limit(10))

## Processamento: dm_clientes

In [0]:
# Bronze input and silver output for the customers dimension.
src_table = ".".join([catalog_name, bronze_db_name, "dm_clientes"])
tgt_table = ".".join([catalog_name, silver_db_name, "dm_clientes"])

# Fail fast when the bronze table cannot be read.
if safe_table_exists(spark, src_table):
    df_src = spark.table(src_table)
else:
    raise RuntimeError(f"Tabela não encontrada: {src_table}")

# Row count before cleaning, printed for comparison with the silver count.
total_before = df_src.count()
print("Bronze: {:,} registros".format(total_before))

In [0]:
# Clean each customer attribute: trimmed id, normalised name/region,
# lower-cased e-mail and integer age.
df = (
    df_src
    .withColumn("id_cliente", trim(safe_col(df_src, "id_cliente")))
    .withColumn("nome", initcap(remove_accents_udf(normalize_text(safe_col(df_src, "nome")))))
    .withColumn("email", lower(trim(safe_col(df_src, "email"))))
    .withColumn("regiao", initcap(remove_accents_udf(normalize_text(safe_col(df_src, "regiao")))))
    .withColumn("idade", safe_cast_int(safe_col(df_src, "idade")))
)

# Keep rows with non-blank id, name and e-mail, where the e-mail contains "@".
df_valid = df.filter(
    col("id_cliente").isNotNull()
    & (trim(col("id_cliente")) != "")
    & col("nome").isNotNull()
    & (trim(col("nome")) != "")
    & col("email").isNotNull()
    & (trim(col("email")) != "")
    & col("email").contains("@")
)

# One row per id_cliente: newest ingestion first when that column exists,
# otherwise a constant ordering (the surviving row is then arbitrary).
if "ingestion_timestamp" in df_valid.columns:
    w = Window.partitionBy("id_cliente").orderBy(col("ingestion_timestamp").desc_nulls_last())
else:
    w = Window.partitionBy("id_cliente").orderBy(lit(datetime.now()))

df_dedup = (
    df_valid
    .withColumn("rn", row_number().over(w))
    .filter(col("rn") == 1)
    .drop("rn")
)

df_final = df_dedup.withColumn("processed_timestamp", current_timestamp())

# Project to the silver layout, skipping columns that are not present.
cols = [
    "id_cliente",
    "nome",
    "email",
    "regiao",
    "idade",
    "processed_timestamp",
    "ingestion_timestamp",
]
df_final = df_final.select(*[c for c in cols if c in df_final.columns])

# Enforce the final types; ingestion_timestamp is added as NULL when missing.
df_typed = (
    df_final
    .withColumn("id_cliente", col("id_cliente").cast(StringType()))
    .withColumn("nome", col("nome").cast(StringType()))
    .withColumn("email", col("email").cast(StringType()))
    .withColumn("regiao", col("regiao").cast(StringType()))
    .withColumn("idade", col("idade").cast(IntegerType()))
    .withColumn("processed_timestamp", col("processed_timestamp").cast(TimestampType()))
    .withColumn(
        "ingestion_timestamp",
        col("ingestion_timestamp").cast(TimestampType())
        if "ingestion_timestamp" in df_final.columns
        else lit(None).cast(TimestampType()),
    )
)

print("Silver: {:,} registros".format(df_typed.count()))

In [0]:
# Replace the silver table (schema included) with this run's result.
(
    df_typed.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(tgt_table)
)
print("Salvo: {}".format(tgt_table))
# Preview the first rows of what was just written.
display(spark.table(tgt_table).limit(10))

## Processamento: ft_pesquisa_satisfacao

In [0]:
# Bronze input and silver output for the satisfaction survey table.
src_table = ".".join([catalog_name, bronze_db_name, "ft_pesquisa_satisfacao"])
tgt_table = ".".join([catalog_name, silver_db_name, "ft_pesquisa_satisfacao"])

# Fail fast when the bronze table cannot be read.
if safe_table_exists(spark, src_table):
    df_src = spark.table(src_table)
else:
    raise RuntimeError(f"Tabela não encontrada: {src_table}")

# Row count before cleaning, printed for comparison with the silver count.
total_before = df_src.count()
print("Bronze: {:,} registros".format(total_before))

In [0]:
# All three business columns are cleaned to integers.
df = (
    df_src
    .withColumn("id_pesquisa", safe_cast_int(safe_col(df_src, "id_pesquisa")))
    .withColumn("id_chamado", safe_cast_int(safe_col(df_src, "id_chamado")))
    .withColumn("nota_atendimento", safe_cast_int(safe_col(df_src, "nota_atendimento")))
)

# Keep rows with positive ids and a score between 1 and 5 inclusive.
df_valid = df.filter(
    col("id_pesquisa").isNotNull()
    & (col("id_pesquisa") > 0)
    & col("id_chamado").isNotNull()
    & (col("id_chamado") > 0)
    & col("nota_atendimento").isNotNull()
    & (col("nota_atendimento") >= 1)
    & (col("nota_atendimento") <= 5)
)

# One row per id_pesquisa: newest ingestion first when that column exists,
# otherwise a constant ordering (the surviving row is then arbitrary).
if "ingestion_timestamp" in df_valid.columns:
    w = Window.partitionBy("id_pesquisa").orderBy(col("ingestion_timestamp").desc_nulls_last())
else:
    w = Window.partitionBy("id_pesquisa").orderBy(lit(datetime.now()))

df_dedup = (
    df_valid
    .withColumn("rn", row_number().over(w))
    .filter(col("rn") == 1)
    .drop("rn")
)

df_final = df_dedup.withColumn("processed_timestamp", current_timestamp())

# Project to the silver layout, skipping columns that are not present.
cols = [
    "id_pesquisa",
    "id_chamado",
    "nota_atendimento",
    "processed_timestamp",
    "ingestion_timestamp",
]
df_final = df_final.select(*[c for c in cols if c in df_final.columns])

# Enforce the final types; ingestion_timestamp is added as NULL when missing.
df_typed = (
    df_final
    .withColumn("id_pesquisa", col("id_pesquisa").cast(IntegerType()))
    .withColumn("id_chamado", col("id_chamado").cast(IntegerType()))
    .withColumn("nota_atendimento", col("nota_atendimento").cast(IntegerType()))
    .withColumn("processed_timestamp", col("processed_timestamp").cast(TimestampType()))
    .withColumn(
        "ingestion_timestamp",
        col("ingestion_timestamp").cast(TimestampType())
        if "ingestion_timestamp" in df_final.columns
        else lit(None).cast(TimestampType()),
    )
)

print("Silver: {:,} registros".format(df_typed.count()))

In [0]:
# Replace the silver table (schema included) with this run's result.
(
    df_typed.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(tgt_table)
)
print("Salvo: {}".format(tgt_table))
# Preview the first rows of what was just written.
display(spark.table(tgt_table).limit(10))

## Processamento: ft_chamados

In [0]:
# Inputs: bronze tickets plus the bronze reasons table that the transform cell
# cross-joins against; output: silver tickets.
src_table_chamados = f"{catalog_name}.{bronze_db_name}.ft_chamados"
src_table_motivos = f"{catalog_name}.{bronze_db_name}.ft_motivos"
tgt_table = f"{catalog_name}.{silver_db_name}.ft_chamados"

# Both inputs are read below, so both are checked up front and fail with the
# same error message (previously only the tickets table was checked).
if not safe_table_exists(spark, src_table_chamados):
    raise RuntimeError(f"Tabela não encontrada: {src_table_chamados}")
if not safe_table_exists(spark, src_table_motivos):
    raise RuntimeError(f"Tabela não encontrada: {src_table_motivos}")

df_chamados_src = spark.table(src_table_chamados)
df_motivos_src = spark.table(src_table_motivos)

# Row count before cleaning, printed for comparison with the silver count.
total_before = df_chamados_src.count()
print(f"Bronze: {total_before:,} registros")

In [0]:
# Channel text is upper-cased and accent-free before matching, so every LIKE
# pattern below has to be upper-case too (LIKE is case-sensitive).
col_canal_norm = upper(remove_accents_udf(normalize_text(col("canal"))))
col_canal_final = (
    when(col_canal_norm.like("%ESPECIALIZADO%"), lit("Atendimento Especializado"))
    .when(col_canal_norm.like("%INICIAL%"), lit("Atendimento Inicial"))
    .when(col_canal_norm.like("U%"), lit("Ura"))
    .when(col_canal_norm.like("%BOT%"), lit("Chatbot"))
    .when(col_canal_norm.like("%WEB%"), lit("Web"))
    # Fixed: this pattern was lower-case "%mail%", which can never match the
    # upper-cased text, so e-mail tickets fell through to the raw value.
    .when(col_canal_norm.like("%MAIL%"), lit("Email"))
    .otherwise(col_canal_norm)
)

# Values starting with "S" become "Sim", with "N" become "Não"; anything else
# keeps its normalised text.
col_resolvido_norm = upper(remove_accents_udf(normalize_text(col("resolvido"))))
col_resolvido_final = (
    when(col_resolvido_norm.like("S%"), lit("Sim"))
    .when(col_resolvido_norm.like("N%"), lit("Não"))
    .otherwise(col_resolvido_norm)
)

df = (
    df_chamados_src
    .withColumn("canal", col_canal_final)
    .withColumn("resolvido", col_resolvido_final)
    .withColumn("motivo_norm", upper(remove_accents_udf(normalize_text(col("motivo")))))
)

df_motivos = df_motivos_src.withColumn("nome_motivo_norm", upper(remove_accents_udf(normalize_text(col("nome_motivo")))))

# Fuzzy match of the free-text reason: pair every ticket with every reason and
# score the pair as (1 - levenshtein / longer length) * 100.
df_cross = df.alias("c").crossJoin(df_motivos.alias("m"))
df_cross = df_cross.withColumn(
    "similarity",
    (1 - (levenshtein(col("motivo_norm"), col("nome_motivo_norm")) / F.greatest(length(col("motivo_norm")), length(col("nome_motivo_norm"))))) * 100,
)

# Keep the single highest-scoring reason per ticket id.
window = Window.partitionBy("c.id_chamado").orderBy(col("similarity").desc())
df_best = df_cross.withColumn("rank", F.row_number().over(window)).filter(col("rank") == 1)

# silver_canais and silver_chamados_hora are created by the dm_canais and
# ft_chamados_hora write cells, so those cells must have run first.
df_final = df_best.alias("best").join(silver_canais, col("best.canal") == silver_canais.nome_canal, "left")

df_final = df_final.join(
    silver_chamados_hora.alias("h"),
    (col("best.id_chamado") == col("h.id_chamado")) & (col("best.id_cliente") == col("h.id_cliente")),
    "left",
)

# Prefer the parsed timestamps from ft_chamados_hora and fall back to the
# bronze value on the ticket itself.
# NOTE(review): if the bronze fallback columns are strings, each coalesce mixes
# timestamp and string inputs — confirm the resulting column type is as intended.
df_final = df_final.select(
    col("best.id_chamado"),
    col("best.id_cliente"),
    col("best.id_motivo"),
    col("best.motivo"),
    col("id_canal"),
    col("best.canal"),
    col("best.resolvido"),
    coalesce(col("h.hora_abertura_chamado_brasilia"), col("best.hora_abertura_chamado")).alias("hora_abertura_chamado"),
    # The literal text "igual a hora de abertura" means the start time equals the opening time.
    when(
        col("best.hora_inicio_atendimento") == "igual a hora de abertura",
        coalesce(col("h.hora_abertura_chamado_brasilia"), col("best.hora_abertura_chamado")),
    ).otherwise(
        coalesce(col("h.hora_inicio_atendimento_brasilia"), col("best.hora_inicio_atendimento"))
    ).alias("hora_inicio_atendimento"),
    coalesce(col("h.hora_finalizacao_atendimento_brasilia"), col("best.hora_finalizacao_atendimento")).alias("hora_finalizacao_atendimento"),
    col("best.id_atendente"),
    current_timestamp().alias("processed_timestamp"),
)

# Waiting time: minutes from opening to start, rounded to 2 decimals; NULL when either side is missing.
df_final = df_final.withColumn(
    "tempo_espera_minutos",
    when(
        (col("hora_inicio_atendimento").isNotNull()) & (col("hora_abertura_chamado").isNotNull()),
        round((unix_timestamp(col("hora_inicio_atendimento")) - unix_timestamp(col("hora_abertura_chamado"))) / 60.0, 2),
    ).otherwise(lit(None)),
)

# Handling time: minutes from start to end, rounded to 2 decimals; NULL when either side is missing.
df_final = df_final.withColumn(
    "tempo_atendimento_minutos",
    when(
        (col("hora_finalizacao_atendimento").isNotNull()) & (col("hora_inicio_atendimento").isNotNull()),
        round((unix_timestamp(col("hora_finalizacao_atendimento")) - unix_timestamp(col("hora_inicio_atendimento"))) / 60.0, 2),
    ).otherwise(lit(None)),
)

print(f"Silver: {df_final.count():,} registros")

In [0]:
# Replace the silver table (schema included) with this run's result.
(
    df_final.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(tgt_table)
)
print("Salvo: {}".format(tgt_table))
# Preview the first rows of what was just written.
display(spark.table(tgt_table).limit(10))

## Processamento: ft_custos

In [0]:
# Bronze input and silver output for the costs table.
src_table = ".".join([catalog_name, bronze_db_name, "ft_custos"])
tgt_table = ".".join([catalog_name, silver_db_name, "ft_custos"])

# Fail fast when the bronze table cannot be read.
if safe_table_exists(spark, src_table):
    df_src = spark.table(src_table)
else:
    raise RuntimeError(f"Tabela não encontrada: {src_table}")

# Row count before cleaning, printed for comparison with the silver count.
total_before = df_src.count()
print("Bronze: {:,} registros".format(total_before))

In [0]:
# Clean the ids to integers and the cost to decimal(18,8).
df = (
    df_src
    .withColumn("id_chamado", safe_cast_int(safe_col(df_src, "id_chamado")))
    .withColumn("id_custo", safe_cast_int(safe_col(df_src, "id_custo")))
    # Cost: keep digits, comma, dot and minus, turn commas into dots, then cast.
    # NOTE(review): a value using both separators (e.g. "1.234,56") ends up as
    # "1.234.56", which presumably does not cast — confirm the source format.
    .withColumn(
        "custo",
        regexp_replace(
            regexp_replace(safe_col(df_src, "custo"), "[^0-9,.-]", ""),
            ",",
            ".",
        ).cast("decimal(18,8)"),
    )
)

# Keep rows with positive ids and a non-negative cost.
df_valid = df.filter(
    col("id_chamado").isNotNull()
    & (col("id_chamado") > 0)
    & col("id_custo").isNotNull()
    & (col("id_custo") > 0)
    & col("custo").isNotNull()
    & (col("custo") >= 0)
)

# One row per (id_chamado, id_custo): newest ingestion first when that column
# exists, otherwise a constant ordering (the surviving row is then arbitrary).
if "ingestion_timestamp" in df_valid.columns:
    w = Window.partitionBy("id_chamado", "id_custo").orderBy(col("ingestion_timestamp").desc_nulls_last())
else:
    w = Window.partitionBy("id_chamado", "id_custo").orderBy(lit(datetime.now()))

df_dedup = (
    df_valid
    .withColumn("rn", row_number().over(w))
    .filter(col("rn") == 1)
    .drop("rn")
)

df_final = df_dedup.withColumn("processed_timestamp", current_timestamp())

# Project to the silver layout, skipping columns that are not present.
cols = [
    "id_chamado",
    "id_custo",
    "custo",
    "processed_timestamp",
    "ingestion_timestamp",
]
df_final = df_final.select(*[c for c in cols if c in df_final.columns])

# Enforce the final types; ingestion_timestamp is added as NULL when missing.
df_typed = (
    df_final
    .withColumn("id_chamado", col("id_chamado").cast(IntegerType()))
    .withColumn("id_custo", col("id_custo").cast(IntegerType()))
    .withColumn("custo", col("custo").cast("decimal(18,8)"))
    .withColumn("processed_timestamp", col("processed_timestamp").cast(TimestampType()))
    .withColumn(
        "ingestion_timestamp",
        col("ingestion_timestamp").cast(TimestampType())
        if "ingestion_timestamp" in df_final.columns
        else lit(None).cast(TimestampType()),
    )
)

print("Silver: {:,} registros".format(df_typed.count()))

In [0]:
# Replace the silver table (schema included) with this run's result.
(
    df_typed.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(tgt_table)
)
print("Salvo: {}".format(tgt_table))
# Preview the first rows of what was just written.
display(spark.table(tgt_table).limit(10))

In [0]:
def safe_table_exists(spark, full_name: str) -> bool:
    """Report whether ``spark.table(full_name)`` can be called without raising.

    Args:
        spark: Session object exposing ``table(name)``.
        full_name: Table identifier passed verbatim to ``spark.table``.

    Returns:
        True if the lookup did not raise, otherwise False.

    Only ``Exception`` subclasses count as "table not available";
    ``KeyboardInterrupt`` and ``SystemExit`` are allowed to propagate instead
    of being reported as a missing table.
    """
    try:
        spark.table(full_name)
    except Exception:
        return False
    return True

def safe_col(df, name):
    """Return ``col(name)`` when ``df`` has that column, else a NULL cast to string.

    Presence is decided by an exact string match against ``df.columns``.
    """
    column_present = name in df.columns
    if column_present:
        return col(name)
    return lit(None).cast(StringType())

def safe_cast_int(col_expr):
    """Turn ``col_expr`` into an integer column built from its digits only.

    NULL or blank (after trim) inputs give NULL; for anything else all
    non-digit characters are dropped before the integer cast.
    """
    # NOTE(review): "-" and "." are dropped along with other non-digits, so
    # "-5" yields 5 and "1.0" yields 10 — confirm this is the desired behaviour.
    trimmed_text = trim(col_expr).cast(StringType())
    is_filled = col_expr.isNotNull() & (trimmed_text != "")
    as_integer = F.regexp_replace(trimmed_text, r'[^\d]', '').cast(IntegerType())
    return when(is_filled, as_integer).otherwise(None)

def remove_accents(text_col):
    """Replace accented Latin letters in ``text_col`` with unaccented ones.

    Implemented with ``F.translate``: each character of ``with_accents`` is
    mapped to the character at the same position in ``without_accents``.
    """
    with_accents = "áàãâäéèêëíìîïóòõôöúùûüçñÁÀÃÂÄÉÈÊËÍÌÎÏÓÒÕÔÖÚÙÛÜÇÑ"
    without_accents = "aaaaaeeeeiiiiooooouuuucnAAAAAEEEEIIIIOOOOOUUUUCN"
    return F.translate(text_col, with_accents, without_accents)

def normalize_text(col_expr):
    """Return ``col_expr`` with whitespace collapsed, disallowed characters removed, and ends trimmed.

    Whitespace runs are first replaced by one space; then every character
    outside printable ASCII (0x20-0x7E) and U+00C0-U+00FF is deleted.
    """
    collapsed = regexp_replace(col_expr, r'\s+', ' ')
    filtered = regexp_replace(collapsed, r'[^\x20-\x7E\u00C0-\u00FF]', '')
    return trim(filtered)

def clean_brazil_timestamp(col_expr):
    """Apply three regex clean-ups to a date-time string column, then trim it.

    In order: the character class is replaced by a space, the standalone word
    "às"/"as" is replaced by a space, and whitespace runs are collapsed.
    """
    result = col_expr
    for pattern, replacement in (
        (r'[ ]', ' '),
        (r'\bàs\b|\bas\b|\bàs\b', ' '),
        (r'\s+', ' '),
    ):
        result = regexp_replace(result, pattern, replacement)
    return trim(result)

def parse_to_brasilia_timezone(col_expr):
    """Parse a cleaned 'dd/MM/yyyy HH:mm:ss' text column into a timestamp.

    The text is first tidied by `clean_brazil_timestamp`, parsed with the
    fixed pattern, converted to UTC as if it were America/Sao_Paulo wall
    time, and then converted back from UTC to the same zone.

    NOTE(review): converting to UTC and back with the same zone id looks like
    a round trip; confirm whether a real shift from another zone was intended.
    """
    zone_id = 'America/Sao_Paulo'
    naive_ts = to_timestamp(clean_brazil_timestamp(col_expr), 'dd/MM/yyyy HH:mm:ss')
    as_utc = F.to_utc_timestamp(naive_ts, zone_id)
    return F.from_utc_timestamp(as_utc, zone_id)

## FT_ATENDENTES

In [0]:
# Bronze source and Silver target for the attendants table.
src_table = f"{catalog_name}.{bronze_db_name}.ft_atendentes"
tgt_table = f"{catalog_name}.{silver_db_name}.ft_atendentes"

if not safe_table_exists(spark, src_table):
    raise RuntimeError(f"Tabela não encontrada: {src_table}")

df = spark.table(src_table)
total_before = df.count()

# Service level parsed once; only the values 1 and 2 are kept, anything else becomes NULL.
nivel_int = safe_cast_int(safe_col(df, "nivel_atendimento"))

df = (
    df.withColumn("id_atendente", safe_cast_int(safe_col(df, "id_atendente")))
    .withColumn("nome_atendente", initcap(remove_accents(normalize_text(safe_col(df, "nome_atendente")))))
    .withColumn("nivel_atendimento", when(nivel_int.isin([1, 2]), nivel_int).otherwise(None))
)

# Keep rows with a positive id, a non-blank name and a valid service level.
df = df.filter(
    (col("id_atendente").isNotNull()) & (col("id_atendente") > 0) &
    (col("nome_atendente").isNotNull()) & (trim(col("nome_atendente")) != "") &
    (col("nivel_atendimento").isNotNull())
)

# One row per attendant: the most recently ingested wins when the column exists.
w = Window.partitionBy("id_atendente").orderBy(
    col("ingestion_timestamp").desc_nulls_last() if "ingestion_timestamp" in df.columns else lit(datetime.now())
)

df = df.withColumn("rn", row_number().over(w)).filter(col("rn") == 1).drop("rn")
df = df.withColumn("processed_timestamp", current_timestamp())

final_cols = ["id_atendente", "nome_atendente", "nivel_atendimento", "processed_timestamp", "ingestion_timestamp"]
df = df.select(*[c for c in final_cols if c in df.columns])

# ingestion_timestamp is optional in Bronze; fall back to a NULL timestamp.
ingestion_ts_expr = (
    col("ingestion_timestamp").cast(TimestampType())
    if "ingestion_timestamp" in df.columns
    else lit(None).cast(TimestampType())
)
df = (
    df.withColumn("id_atendente", col("id_atendente").cast(IntegerType()))
    .withColumn("nome_atendente", col("nome_atendente").cast(StringType()))
    .withColumn("nivel_atendimento", col("nivel_atendimento").cast(IntegerType()))
    .withColumn("processed_timestamp", col("processed_timestamp").cast(TimestampType()))
    .withColumn("ingestion_timestamp", ingestion_ts_expr)
)

df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable(tgt_table)
final = spark.table(tgt_table)

# Count once, and guard the ratio so an empty Bronze table does not raise ZeroDivisionError.
total_after = final.count()
retention_pct = (total_after / total_before * 100) if total_before else 0.0
print(f"FT_ATENDENTES | Bronze: {total_before:,} | Silver: {total_after:,} | Taxa: {retention_pct:.1f}%")

## FT_CHAMADOS_HORA

In [0]:
# Bronze source and Silver target for the ticket timing table.
src_table = f"{catalog_name}.{bronze_db_name}.ft_chamados_hora"
tgt_table = f"{catalog_name}.{silver_db_name}.ft_chamados_hora"

if not safe_table_exists(spark, src_table):
    raise RuntimeError(f"Tabela não encontrada: {src_table}")

df = spark.table(src_table)
total_before = df.count()

# safe_col checks names by exact match against df.columns, so the Bronze
# columns must be spelled exactly as below or they are treated as missing.
df = (
    df.withColumn("id_chamado", safe_cast_int(safe_col(df, "ID_Chamado")))
    .withColumn("id_cliente", trim(safe_col(df, "ID_Cliente")))
    .withColumn("hora_abertura_chamado_brasilia", parse_to_brasilia_timezone(safe_col(df, "Hora_Abertura_Chamado")))
    .withColumn("hora_inicio_atendimento_brasilia", parse_to_brasilia_timezone(safe_col(df, "Hora_Inicio_Atendimento")))
    .withColumn("hora_finalizacao_atendimento_brasilia", parse_to_brasilia_timezone(safe_col(df, "Hora_Finalizacao_Atendimento")))
)

# Keep rows with valid keys, all three timestamps parsed, and a
# chronological sequence: opening <= start <= finish.
df = df.filter(
    (col("id_chamado").isNotNull()) & (col("id_chamado") > 0) &
    (col("id_cliente").isNotNull()) & (length(col("id_cliente")) > 0) &
    (col("hora_abertura_chamado_brasilia").isNotNull()) &
    (col("hora_inicio_atendimento_brasilia").isNotNull()) &
    (col("hora_finalizacao_atendimento_brasilia").isNotNull()) &
    (col("hora_abertura_chamado_brasilia") <= col("hora_inicio_atendimento_brasilia")) &
    (col("hora_inicio_atendimento_brasilia") <= col("hora_finalizacao_atendimento_brasilia"))
)

# One row per ticket: the most recently ingested wins when the column exists.
w = Window.partitionBy("id_chamado").orderBy(
    col("ingestion_timestamp").desc_nulls_last() if "ingestion_timestamp" in df.columns else lit(datetime.now())
)

df = df.withColumn("rn", row_number().over(w)).filter(col("rn") == 1).drop("rn")
df = df.withColumn("processed_timestamp", current_timestamp())

final_cols = ["id_chamado", "id_cliente", "hora_abertura_chamado_brasilia", "hora_inicio_atendimento_brasilia", "hora_finalizacao_atendimento_brasilia", "processed_timestamp", "ingestion_timestamp"]
df = df.select(*[c for c in final_cols if c in df.columns])

# ingestion_timestamp is optional in Bronze; fall back to a NULL timestamp.
ingestion_ts_expr = (
    col("ingestion_timestamp").cast(TimestampType())
    if "ingestion_timestamp" in df.columns
    else lit(None).cast(TimestampType())
)
df = (
    df.withColumn("id_chamado", col("id_chamado").cast(IntegerType()))
    .withColumn("id_cliente", col("id_cliente").cast(StringType()))
    .withColumn("hora_abertura_chamado_brasilia", col("hora_abertura_chamado_brasilia").cast(TimestampType()))
    .withColumn("hora_inicio_atendimento_brasilia", col("hora_inicio_atendimento_brasilia").cast(TimestampType()))
    .withColumn("hora_finalizacao_atendimento_brasilia", col("hora_finalizacao_atendimento_brasilia").cast(TimestampType()))
    .withColumn("processed_timestamp", col("processed_timestamp").cast(TimestampType()))
    .withColumn("ingestion_timestamp", ingestion_ts_expr)
)

df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable(tgt_table)
final = spark.table(tgt_table)

# Count once, and guard the ratio so an empty Bronze table does not raise ZeroDivisionError.
total_after = final.count()
retention_pct = (total_after / total_before * 100) if total_before else 0.0
print(f"FT_CHAMADOS_HORA | Bronze: {total_before:,} | Silver: {total_after:,} | Taxa: {retention_pct:.1f}%")

## DM_MOTIVOS

In [0]:
# The reasons dimension is built from the Bronze ft_motivos table.
src_table = f"{catalog_name}.{bronze_db_name}.ft_motivos"
tgt_table = f"{catalog_name}.{silver_db_name}.dm_motivos"

if not safe_table_exists(spark, src_table):
    raise RuntimeError(f"Tabela não encontrada: {src_table}")

df = spark.table(src_table)
total_before = df.count()


def _like_any(text_col, keywords):
    """OR together one `LIKE '%keyword%'` test per keyword, in the given order."""
    condition = None
    for keyword in keywords:
        test = text_col.like(f"%{keyword}%")
        condition = test if condition is None else (condition | test)
    return condition


# Ordered (category, keywords) rules; the first matching rule wins.
category_rules = [
    ("Financeiro", ["fatura", "limite", "contrato", "pagamento", "divida", "renegocia"]),
    ("Cartão", ["cartao", "bloqueio", "desbloqueio", "compra", "adicional"]),
    ("Cadastral", ["dados", "cadastra", "telefone", "email", "agencia", "conta", "endereco"]),
    ("Atendimento", ["app", "aplicativo", "site", "chatbot", "ura", "problema", "erro"]),
    ("Benefícios", ["ponto", "beneficio", "programa"]),
]

# The keywords are written without accents, so the name is compared in
# lower case with accents stripped; otherwise accented spellings of these
# words would never match and would fall into "Desconhecida".
motivo_key = lower(remove_accents(col("nome_motivo")))

categoria_expr = None
for label, keywords in category_rules:
    rule = _like_any(motivo_key, keywords)
    categoria_expr = when(rule, lit(label)) if categoria_expr is None else categoria_expr.when(rule, lit(label))
categoria_expr = categoria_expr.otherwise(lit("Desconhecida"))

df = (
    df.withColumn("id_motivo", safe_cast_int(safe_col(df, "id_motivo")))
    .withColumn("nome_motivo", safe_col(df, "nome_motivo"))
    .withColumn("categoria", categoria_expr)
    # Normalise the unaccented spelling of the medium criticality level.
    .withColumn("criticidade", when(col("criticidade") == lit("Media"), lit("Média")).otherwise(col("criticidade")))
)

# Keep rows with a positive id and a non-blank name.
df = df.filter(
    (col("id_motivo").isNotNull()) & (col("id_motivo") > 0) &
    (col("nome_motivo").isNotNull()) & (trim(col("nome_motivo")) != "")
)

# One row per reason: the most recently ingested wins when the column exists.
w = Window.partitionBy("id_motivo").orderBy(
    col("ingestion_timestamp").desc_nulls_last() if "ingestion_timestamp" in df.columns else lit(datetime.now())
)

df = df.withColumn("rn", row_number().over(w)).filter(col("rn") == 1).drop("rn")
df = df.withColumn("processed_timestamp", current_timestamp())

final_cols = ["id_motivo", "nome_motivo", "categoria", "criticidade", "processed_timestamp", "ingestion_timestamp"]
df = df.select(*[c for c in final_cols if c in df.columns])

# ingestion_timestamp is optional in Bronze; fall back to a NULL timestamp.
ingestion_ts_expr = (
    col("ingestion_timestamp").cast(TimestampType())
    if "ingestion_timestamp" in df.columns
    else lit(None).cast(TimestampType())
)
df = (
    df.withColumn("id_motivo", col("id_motivo").cast(IntegerType()))
    .withColumn("nome_motivo", col("nome_motivo").cast(StringType()))
    .withColumn("categoria", col("categoria").cast(StringType()))
    .withColumn("criticidade", col("criticidade").cast(StringType()))
    .withColumn("processed_timestamp", col("processed_timestamp").cast(TimestampType()))
    .withColumn("ingestion_timestamp", ingestion_ts_expr)
)

df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable(tgt_table)
final = spark.table(tgt_table)

# Count once, and guard the ratio so an empty Bronze table does not raise ZeroDivisionError.
total_after = final.count()
retention_pct = (total_after / total_before * 100) if total_before else 0.0
print(f"DM_MOTIVOS | Bronze: {total_before:,} | Silver: {total_after:,} | Taxa: {retention_pct:.1f}%")

## DM_CANAIS

In [0]:
# Bronze source and Silver target for the channels dimension.
src_table = f"{catalog_name}.{bronze_db_name}.dm_canais"
tgt_table = f"{catalog_name}.{silver_db_name}.dm_canais"

if not safe_table_exists(spark, src_table):
    raise RuntimeError(f"Tabela não encontrada: {src_table}")

df = spark.table(src_table)
total_before = df.count()

# Standardise name and status, then apply the two status overrides.
# NOTE(review): the value "Invativo" is mapped to "Ativo" and the "Web"
# channel is forced to "Inativo"; both look like data-specific corrections —
# confirm they are intended.
df = (
    df.withColumn("nome_canal", initcap(remove_accents(normalize_text(safe_col(df, "nome_canal")))))
    .withColumn("canal_status", initcap(remove_accents(normalize_text(safe_col(df, "canal_status")))))
    .withColumn(
        "canal_status",
        when(col("canal_status") == lit("Invativo"), lit("Ativo"))
        .when(col("nome_canal") == "Web", lit("Inativo"))
        .otherwise(col("canal_status"))
    )
)

# Drop rows without a channel name.
df = df.filter((col("nome_canal").isNotNull()) & (trim(col("nome_canal")) != ""))

# One row per channel name: the most recently ingested wins when the column exists.
w = Window.partitionBy("nome_canal").orderBy(
    col("ingestion_timestamp").desc_nulls_last() if "ingestion_timestamp" in df.columns else lit(datetime.now())
)

df = df.withColumn("rn", row_number().over(w)).filter(col("rn") == 1).drop("rn")

# Assign the surrogate key only after filtering and deduplication, so that
# discarded rows do not consume ids, and break ties on the (now unique)
# channel name so the numbering is the same on every run.
w_id = Window.orderBy("canal_status", "nome_canal")
df = df.withColumn("id_canal", row_number().over(w_id))

df = df.withColumn("processed_timestamp", current_timestamp())

final_cols = ["id_canal", "nome_canal", "canal_status", "processed_timestamp", "ingestion_timestamp"]
df = df.select(*[c for c in final_cols if c in df.columns])

# ingestion_timestamp is optional in Bronze; fall back to a NULL timestamp.
ingestion_ts_expr = (
    col("ingestion_timestamp").cast(TimestampType())
    if "ingestion_timestamp" in df.columns
    else lit(None).cast(TimestampType())
)
df = (
    df.withColumn("id_canal", col("id_canal").cast(IntegerType()))
    .withColumn("nome_canal", col("nome_canal").cast(StringType()))
    .withColumn("canal_status", col("canal_status").cast(StringType()))
    .withColumn("processed_timestamp", col("processed_timestamp").cast(TimestampType()))
    .withColumn("ingestion_timestamp", ingestion_ts_expr)
)

df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable(tgt_table)
final = spark.table(tgt_table)

# Count once, and guard the ratio so an empty Bronze table does not raise ZeroDivisionError.
total_after = final.count()
retention_pct = (total_after / total_before * 100) if total_before else 0.0
print(f"DM_CANAIS | Bronze: {total_before:,} | Silver: {total_after:,} | Taxa: {retention_pct:.1f}%")

## DM_CLIENTES

In [0]:
# Bronze source and Silver target for the customers dimension.
src_table = f"{catalog_name}.{bronze_db_name}.dm_clientes"
tgt_table = f"{catalog_name}.{silver_db_name}.dm_clientes"

if not safe_table_exists(spark, src_table):
    raise RuntimeError(f"Tabela não encontrada: {src_table}")

df = spark.table(src_table)
total_before = df.count()

# Trim the key, standardise name and region, lower-case the e-mail, parse the age.
df = (
    df.withColumn("id_cliente", trim(safe_col(df, "id_cliente")))
    .withColumn("nome", initcap(remove_accents(normalize_text(safe_col(df, "nome")))))
    .withColumn("email", lower(trim(safe_col(df, "email"))))
    .withColumn("regiao", initcap(remove_accents(normalize_text(safe_col(df, "regiao")))))
    .withColumn("idade", safe_cast_int(safe_col(df, "idade")))
)

# Keep rows with a non-blank id and name, and an e-mail that contains "@".
df = df.filter(
    (col("id_cliente").isNotNull()) & (trim(col("id_cliente")) != "") &
    (col("nome").isNotNull()) & (trim(col("nome")) != "") &
    (col("email").isNotNull()) & (trim(col("email")) != "") & (col("email").contains("@"))
)

# One row per customer: the most recently ingested wins when the column exists.
w = Window.partitionBy("id_cliente").orderBy(
    col("ingestion_timestamp").desc_nulls_last() if "ingestion_timestamp" in df.columns else lit(datetime.now())
)

df = df.withColumn("rn", row_number().over(w)).filter(col("rn") == 1).drop("rn")
df = df.withColumn("processed_timestamp", current_timestamp())

final_cols = ["id_cliente", "nome", "email", "regiao", "idade", "processed_timestamp", "ingestion_timestamp"]
df = df.select(*[c for c in final_cols if c in df.columns])

# ingestion_timestamp is optional in Bronze; fall back to a NULL timestamp.
ingestion_ts_expr = (
    col("ingestion_timestamp").cast(TimestampType())
    if "ingestion_timestamp" in df.columns
    else lit(None).cast(TimestampType())
)
df = (
    df.withColumn("id_cliente", col("id_cliente").cast(StringType()))
    .withColumn("nome", col("nome").cast(StringType()))
    .withColumn("email", col("email").cast(StringType()))
    .withColumn("regiao", col("regiao").cast(StringType()))
    .withColumn("idade", col("idade").cast(IntegerType()))
    .withColumn("processed_timestamp", col("processed_timestamp").cast(TimestampType()))
    .withColumn("ingestion_timestamp", ingestion_ts_expr)
)

df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable(tgt_table)
final = spark.table(tgt_table)

# Count once, and guard the ratio so an empty Bronze table does not raise ZeroDivisionError.
total_after = final.count()
retention_pct = (total_after / total_before * 100) if total_before else 0.0
print(f"DM_CLIENTES | Bronze: {total_before:,} | Silver: {total_after:,} | Taxa: {retention_pct:.1f}%")

## FT_PESQUISA_SATISFACAO

In [0]:
# Bronze source and Silver target for the satisfaction survey table.
src_table = f"{catalog_name}.{bronze_db_name}.ft_pesquisa_satisfacao"
tgt_table = f"{catalog_name}.{silver_db_name}.ft_pesquisa_satisfacao"

if not safe_table_exists(spark, src_table):
    raise RuntimeError(f"Tabela não encontrada: {src_table}")

df = spark.table(src_table)
total_before = df.count()

# All three business columns are parsed as integers.
df = (
    df.withColumn("id_pesquisa", safe_cast_int(safe_col(df, "id_pesquisa")))
    .withColumn("id_chamado", safe_cast_int(safe_col(df, "id_chamado")))
    .withColumn("nota_atendimento", safe_cast_int(safe_col(df, "nota_atendimento")))
)

# Keep rows with positive ids and a score between 1 and 5 inclusive.
df = df.filter(
    (col("id_pesquisa").isNotNull()) & (col("id_pesquisa") > 0) &
    (col("id_chamado").isNotNull()) & (col("id_chamado") > 0) &
    (col("nota_atendimento").isNotNull()) & (col("nota_atendimento") >= 1) & (col("nota_atendimento") <= 5)
)

# One row per survey: the most recently ingested wins when the column exists.
w = Window.partitionBy("id_pesquisa").orderBy(
    col("ingestion_timestamp").desc_nulls_last() if "ingestion_timestamp" in df.columns else lit(datetime.now())
)

df = df.withColumn("rn", row_number().over(w)).filter(col("rn") == 1).drop("rn")
df = df.withColumn("processed_timestamp", current_timestamp())

final_cols = ["id_pesquisa", "id_chamado", "nota_atendimento", "processed_timestamp", "ingestion_timestamp"]
df = df.select(*[c for c in final_cols if c in df.columns])

# ingestion_timestamp is optional in Bronze; fall back to a NULL timestamp.
ingestion_ts_expr = (
    col("ingestion_timestamp").cast(TimestampType())
    if "ingestion_timestamp" in df.columns
    else lit(None).cast(TimestampType())
)
df = (
    df.withColumn("id_pesquisa", col("id_pesquisa").cast(IntegerType()))
    .withColumn("id_chamado", col("id_chamado").cast(IntegerType()))
    .withColumn("nota_atendimento", col("nota_atendimento").cast(IntegerType()))
    .withColumn("processed_timestamp", col("processed_timestamp").cast(TimestampType()))
    .withColumn("ingestion_timestamp", ingestion_ts_expr)
)

df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable(tgt_table)
final = spark.table(tgt_table)

# Count once, and guard the ratio so an empty Bronze table does not raise ZeroDivisionError.
total_after = final.count()
retention_pct = (total_after / total_before * 100) if total_before else 0.0
print(f"FT_PESQUISA_SATISFACAO | Bronze: {total_before:,} | Silver: {total_after:,} | Taxa: {retention_pct:.1f}%")

## FT_CHAMADOS

In [0]:
# Inputs: Bronze tickets and reasons, plus the Silver channels and timing
# tables written by earlier cells of this notebook.
path_chamados = f"{catalog_name}.{bronze_db_name}.ft_chamados"
path_motivos = f"{catalog_name}.{bronze_db_name}.ft_motivos"
tgt_table = f"{catalog_name}.{silver_db_name}.ft_chamados"
src_canais = f"{catalog_name}.{silver_db_name}.dm_canais"
src_hora = f"{catalog_name}.{silver_db_name}.ft_chamados_hora"

if not safe_table_exists(spark, path_chamados):
    raise RuntimeError(f"Tabela não encontrada: {path_chamados}")

df = spark.table(path_chamados)
total_before = df.count()

df_motivos = spark.table(path_motivos).withColumn("nome_motivo_norm", upper(remove_accents(normalize_text(col("nome_motivo")))))
df_canais = spark.table(src_canais)
df_hora = spark.table(src_hora)

# Channel text in upper case without accents; every pattern below must
# therefore be written in upper case to be able to match.
canal_norm = upper(remove_accents(normalize_text(col("canal"))))
col_canal_final = (
    when(canal_norm.like("%ESPECIALIZADO%"), lit("Atendimento Especializado"))
    .when(canal_norm.like("%INICIAL%"), lit("Atendimento Inicial"))
    .when(canal_norm.like("U%"), lit("Ura"))
    .when(canal_norm.like("%BOT%"), lit("Chatbot"))
    .when(canal_norm.like("%WEB%"), lit("Web"))
    # Was "%mail%": a lower-case pattern can never match upper-cased text,
    # so e-mail channels were never mapped to "Email".
    .when(canal_norm.like("%MAIL%"), lit("Email"))
    .otherwise(canal_norm)
)

# Resolution flag: values starting with S become "Sim", with N become "Não".
resolvido_norm = upper(remove_accents(normalize_text(col("resolvido"))))
col_resolvido_final = (
    when(resolvido_norm.like("S%"), lit("Sim"))
    .when(resolvido_norm.like("N%"), lit("Não"))
    .otherwise(resolvido_norm)
)

df = df.withColumn("canal", col_canal_final).withColumn("resolvido", col_resolvido_final)
df = df.withColumn("motivo_norm", upper(remove_accents(normalize_text(col("motivo")))))

# Fuzzy-match each ticket's free-text reason against every known reason:
# similarity = (1 - levenshtein / longer length) * 100.
df_cross = df.alias("c").crossJoin(df_motivos.alias("m"))
df_cross = df_cross.withColumn(
    "similarity",
    (1 - (levenshtein(col("motivo_norm"), col("nome_motivo_norm")) / F.greatest(length(col("motivo_norm")), length(col("nome_motivo_norm"))))) * 100
)

# Keep the single best-scoring reason per ticket id.
w_sim = Window.partitionBy("c.id_chamado").orderBy(col("similarity").desc())
df_best = df_cross.withColumn("rank", F.row_number().over(w_sim)).filter(col("rank") == 1)

# Attach the channel id (by normalised channel name) and the parsed timestamps.
df_final = df_best.join(df_canais, df_best.canal == df_canais.nome_canal, "left") \
    .join(df_hora.alias("h"), [col("c.id_chamado") == col("h.id_chamado"), col("c.id_cliente") == col("h.id_cliente")], "left")

# NOTE(review): the coalesce calls mix the parsed Silver timestamps with the
# raw Bronze columns, which are compared with a text literal below and so
# look like strings — confirm the resulting column types suit downstream use.
df_final = df_final.select(
    col("c.id_chamado"),
    col("c.id_cliente"),
    col("m.id_motivo"),
    col("m.nome_motivo").alias("motivo"),
    col("id_canal"),
    col("c.canal"),
    col("c.resolvido"),
    coalesce(col("h.hora_abertura_chamado_brasilia"), col("c.hora_abertura_chamado")).alias("hora_abertura_chamado"),
    when(col("c.hora_inicio_atendimento") == "igual a hora de abertura", coalesce(col("h.hora_abertura_chamado_brasilia"), col("c.hora_abertura_chamado")))
        .otherwise(coalesce(col("h.hora_inicio_atendimento_brasilia"), col("c.hora_inicio_atendimento"))).alias("hora_inicio_atendimento"),
    coalesce(col("h.hora_finalizacao_atendimento_brasilia"), col("c.hora_finalizacao_atendimento")).alias("hora_finalizacao_atendimento"),
    col("c.tempo_espera"),
    col("c.tempo_atendimento"),
    col("c.id_atendente"),
    current_timestamp().alias("processed_timestamp")
)

# Recompute waiting and handling time in minutes from the timestamps,
# replacing the Bronze values, then rename both columns.
df_final = df_final.withColumn(
    "tempo_espera",
    when((col("hora_inicio_atendimento").isNotNull()) & (col("hora_abertura_chamado").isNotNull()),
        round((unix_timestamp(col("hora_inicio_atendimento")) - unix_timestamp(col("hora_abertura_chamado"))) / 60.0, 2)
    ).otherwise(lit(None))
).withColumn(
    "tempo_atendimento",
    when((col("hora_finalizacao_atendimento").isNotNull()) & (col("hora_inicio_atendimento").isNotNull()),
        round((unix_timestamp(col("hora_finalizacao_atendimento")) - unix_timestamp(col("hora_inicio_atendimento"))) / 60.0, 2)
    ).otherwise(lit(None))
).withColumnRenamed("tempo_espera", "tempo_espera_minutos").withColumnRenamed("tempo_atendimento", "tempo_atendimento_minutos")

df_final.write.format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable(tgt_table)
final = spark.table(tgt_table)

# Count once, and guard the ratio so an empty Bronze table does not raise ZeroDivisionError.
total_after = final.count()
retention_pct = (total_after / total_before * 100) if total_before else 0.0
print(f"FT_CHAMADOS | Bronze: {total_before:,} | Silver: {total_after:,} | Taxa: {retention_pct:.1f}%")

## FT_CUSTOS

In [0]:
# Bronze source and Silver target for the ticket costs table.
src_table = f"{catalog_name}.{bronze_db_name}.ft_custos"
tgt_table = f"{catalog_name}.{silver_db_name}.ft_custos"

if not safe_table_exists(spark, src_table):
    raise RuntimeError(f"Tabela não encontrada: {src_table}")

df = spark.table(src_table)
total_before = df.count()

# Cost parsing: keep only digits, ',', '.' and '-'; turn commas into dots;
# a lone '.' becomes '0'; then cast to decimal(18,8).
# NOTE(review): a value carrying both separators (e.g. "1.234,56") ends up
# with two dots; presumably the cast then yields NULL and the row is dropped
# by the filter below — confirm against the source format.
custo_text = regexp_replace(safe_col(df, "custo"), "[^0-9,.-]", "")
custo_text = regexp_replace(custo_text, ",", ".")
custo_text = regexp_replace(custo_text, r'^\.$', "0")

df = (
    df.withColumn("id_chamado", safe_cast_int(safe_col(df, "id_chamado")))
    .withColumn("id_custo", safe_cast_int(safe_col(df, "id_custo")))
    .withColumn("custo", custo_text.cast("decimal(18,8)"))
)

# Keep rows with positive ids and a non-negative cost.
df = df.filter(
    (col("id_chamado").isNotNull()) & (col("id_chamado") > 0) &
    (col("id_custo").isNotNull()) & (col("id_custo") > 0) &
    (col("custo").isNotNull()) & (col("custo") >= 0)
)

# One row per (ticket, cost) pair: the most recently ingested wins when the column exists.
w = Window.partitionBy("id_chamado", "id_custo").orderBy(
    col("ingestion_timestamp").desc_nulls_last() if "ingestion_timestamp" in df.columns else lit(datetime.now())
)

df = df.withColumn("rn", row_number().over(w)).filter(col("rn") == 1).drop("rn")
df = df.withColumn("processed_timestamp", current_timestamp())

final_cols = ["id_chamado", "id_custo", "custo", "processed_timestamp", "ingestion_timestamp"]
df = df.select(*[c for c in final_cols if c in df.columns])

# ingestion_timestamp is optional in Bronze; fall back to a NULL timestamp.
ingestion_ts_expr = (
    col("ingestion_timestamp").cast(TimestampType())
    if "ingestion_timestamp" in df.columns
    else lit(None).cast(TimestampType())
)
df = (
    df.withColumn("id_chamado", col("id_chamado").cast(IntegerType()))
    .withColumn("id_custo", col("id_custo").cast(IntegerType()))
    .withColumn("custo", col("custo").cast("decimal(18,8)"))
    .withColumn("processed_timestamp", col("processed_timestamp").cast(TimestampType()))
    .withColumn("ingestion_timestamp", ingestion_ts_expr)
)

df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable(tgt_table)
final = spark.table(tgt_table)

# Count once, and guard the ratio so an empty Bronze table does not raise ZeroDivisionError.
total_after = final.count()
retention_pct = (total_after / total_before * 100) if total_before else 0.0
print(f"FT_CUSTOS | Bronze: {total_before:,} | Silver: {total_after:,} | Taxa: {retention_pct:.1f}%")

## Resumo da Transformação

In [0]:
# Silver tables produced by this notebook, in the order they are written.
tables = [
    "ft_atendentes",
    "ft_chamados_hora",
    "dm_motivos",
    "dm_canais",
    "dm_clientes",
    "ft_pesquisa_satisfacao",
    "ft_chamados",
    "ft_custos"
]

print("\n" + "="*80)
print("TRANSFORMAÇÃO BRONZE → SILVER FINALIZADA")
print("="*80)
for table in tables:
    tgt = f"{catalog_name}.{silver_db_name}.{table}"
    try:
        count = spark.table(tgt).count()
    except Exception as exc:
        # Report why validation failed instead of hiding it; only the first
        # line of the message is shown to keep the summary readable.
        # Interrupts are not caught, so the cell can still be cancelled.
        reason = (str(exc).splitlines() or [""])[0]
        print(f"✗ {table:30} | Erro ao validar: {type(exc).__name__}: {reason}")
    else:
        print(f"✓ {table:30} | Registros: {count:>12,}")
print("="*80)
print(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("="*80)