In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import LongType, StringType, BooleanType, TimestampType, IntegerType, ShortType
from delta.tables import DeltaTable
from datetime import datetime
from pyspark.sql.functions import col, regexp_replace, sha2, when, lit, uuid

# --- Source table coordinates (catalog, schema, table) ---
CATALOGO_ORIGEM, SCHEMA_ORIGEM, TABELA_ORIGEM = "v_credit", "bronze", "chamados_hora"

# --- Destination table coordinates (catalog, schema, table) ---
CATALOGO_DESTINO, SCHEMA_DESTINO, TABELA_DESTINO = "v_credit", "silver", "tb_chamado_log"
# Name reserved for a table of invalid rows; not referenced by the visible cells.
TABELA_INVALIDOS_DESTINO = "tb_chamado_log_invalidos"

# Fully qualified three-part names: <catalog>.<schema>.<table>.
nome_tabela_origem = ".".join((CATALOGO_ORIGEM, SCHEMA_ORIGEM, TABELA_ORIGEM))
nome_tabela_destino = ".".join((CATALOGO_DESTINO, SCHEMA_DESTINO, TABELA_DESTINO))

# Column expression built with F.current_timestamp(); not referenced by the visible cells.
timestamp_atual = F.current_timestamp()

In [0]:
from pyspark.sql.functions import col
from pyspark.sql.types import LongType, TimestampType, StringType

# Read the source table through the fully qualified name assembled in the
# configuration cell, so changing catalog/schema/table there is enough.
df_chamados_hora = spark.table(nome_tabela_origem)

tb_chamado_log_limpo = (
    df_chamados_hora
    # Rename the source columns to the cd_/dh_/dt_/dc_ prefixed names.
    .withColumnRenamed("id_chamado", "cd_chamado")
    .withColumnRenamed("id_cliente", "cd_cliente")
    .withColumnRenamed("hora_abertura_chamado", "dh_abertura")
    .withColumnRenamed("hora_inicio_atendimento", "dh_inicio")
    .withColumnRenamed("hora_finalizacao_atendimento", "dh_fim")
    .withColumnRenamed("ingestion_timestamp", "dt_ingestion")
    .withColumnRenamed("origem", "dc_origem")
    # Enforce explicit types on the key, customer, ingestion and origin columns.
    # The dh_* columns are deliberately left as-is here: they are cleaned and
    # converted to timestamps later by clean_and_convert_timestamp_col.
    .withColumn("cd_chamado", col("cd_chamado").cast(LongType()))
    .withColumn("cd_cliente", col("cd_cliente").cast(StringType()))
    .withColumn("dt_ingestion", col("dt_ingestion").cast(TimestampType()))
    .withColumn("dc_origem", col("dc_origem").cast(StringType()))
    # Drop the Fivetran bookkeeping columns.
    # NOTE(review): rows flagged in "_fivetran_deleted" are not filtered out
    # before the column is dropped - confirm that is intended.
    .drop("ctid_fivetran_id", "_fivetran_deleted", "_fivetran_synced")
)

# NOTE(review): cd_cliente is still un-hashed at this point (it is hashed in
# the next cell), so this output shows the raw value - confirm that is acceptable.
display(tb_chamado_log_limpo)

In [0]:
# Replace the customer code with the SHA-256 digest of its string form.
# This cell rebinds the same name it reads from, so running it a second time
# without re-running the previous cell hashes the already-hashed value again.
cd_cliente_sha256 = sha2(col("cd_cliente").cast(StringType()), 256)
tb_chamado_log_limpo = tb_chamado_log_limpo.withColumn("cd_cliente", cd_cliente_sha256)


In [0]:
from pyspark.sql.functions import col, regexp_replace, to_timestamp
from pyspark.sql.types import TimestampType

def clean_and_convert_timestamp_col(df, raw_col_name, date_format="dd/MM/yyyy HH:mm:ss"):
    """Clean a raw date/time column and convert it to a timestamp in place.

    Args:
        df: DataFrame containing ``raw_col_name``.
        raw_col_name: Name of the column to clean; the converted value is
            written back under the same name.
        date_format: Datetime pattern handed to ``to_timestamp``. Defaults to
            ``"dd/MM/yyyy HH:mm:ss"``, the format previously hard-coded here.

    Returns:
        A new DataFrame in which ``raw_col_name`` holds the parsed value cast
        to ``TimestampType``. The input DataFrame is not modified.

    Note:
        Values that still do not match ``date_format`` after cleaning are
        expected to come back as null rather than raise - this depends on the
        session's ANSI setting; TODO confirm for the target cluster.
    """
    # Collapse "<optional space><non-word chars/underscores>s<optional space>"
    # into a single space. Presumably this strips a textual separator between
    # the date and time parts (possibly with mis-encoded characters) - verify
    # against real bronze values before changing the pattern.
    separador_normalizado = regexp_replace(col(raw_col_name), r'\s?[\W_]*s\s?', ' ')

    # Parse with the requested pattern; the explicit cast keeps the resulting
    # column type fixed to TimestampType, as before.
    valor_convertido = to_timestamp(separador_normalizado, date_format).cast(TimestampType())

    return df.withColumn(raw_col_name, valor_convertido)

# Clean and convert each date/time column in turn, starting from the cleaned
# frame and threading the result through every call.
tb_chamado_log_validacao = tb_chamado_log_limpo
for coluna_dh in ("dh_abertura", "dh_inicio", "dh_fim"):
    tb_chamado_log_validacao = clean_and_convert_timestamp_col(
        tb_chamado_log_validacao, coluna_dh
    )



In [0]:
from delta.tables import DeltaTable
from pyspark.sql import SparkSession

from pyspark.sql import Window

# Delta MERGE raises an error when several source rows match the same target
# row, and inserts duplicate keys when several unmatched source rows share a
# key. Collapse the source to one row per cd_chamado before merging.
# NOTE(review): "keep the row with the most recent dt_ingestion" is an
# assumption about which version should win - confirm with the data owner.
janela_mais_recente = Window.partitionBy("cd_chamado").orderBy(
    F.col("dt_ingestion").desc_nulls_last()
)

tb_chamado_log_merge = (
    tb_chamado_log_validacao
    .withColumn("_ordem_ingestao", F.row_number().over(janela_mais_recente))
    # Rows with a null key never satisfy the merge condition, so they are
    # passed through unchanged instead of being collapsed into a single row.
    .filter((F.col("_ordem_ingestao") == 1) | F.col("cd_chamado").isNull())
    .drop("_ordem_ingestao")
)

deltaTable = DeltaTable.forName(spark, nome_tabela_destino)

# Upsert keyed on cd_chamado: matched rows are fully overwritten by the
# source row, unmatched source rows are inserted.
(
    deltaTable.alias("target")
    .merge(
        source=tb_chamado_log_merge.alias("source"),
        condition="target.cd_chamado = source.cd_chamado",
    )
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)
