# ECRED SIMULATIONS NEW - ETL

### Notebook Objective

In this notebook we will:
1. Ingest data from bronze to silver
2. Define some DLT methods that can be used programmatically to create objects in the pipeline

In [None]:
%pip install /dbfs/FileStore/ecs_dataops_lib_data_quality-0.0.1-py3-none-any.whl
%pip install /dbfs/FileStore/ecs_dataops_utils_cryptography-0.12-py3-none-any.whl


In [None]:
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql import DataFrame
from pyspark.sql.window import Window
from delta.tables import DeltaTable
from datetime import date, datetime, timedelta
from cryptography_library.operations.commons import PlatformCryptographyClient
from cryptography_library.operations import SparkCryptography
#from DataQuality import *

In [None]:
# Handles to the two cryptography entry points imported from cryptography_library.
encriptLake = SparkCryptography(spark)
decriptPlatform = PlatformCryptographyClient()

# Decrypt UDF requested from the platform client for product "ecred", table "simulations".
ecredDecrypt = decriptPlatform.get_decrypt_udf(product="ecred", table="simulations")

# One encryption function per data kind, looked up by the key passed to the library.
# NOTE(review): presumably these are applied to the matching PII columns further
# down the notebook - confirm against the transformation cells.
encryptCPF = encriptLake.get_encryption_function("cpf")
encryptEmail = encriptLake.get_encryption_function("email")
encryptName = encriptLake.get_encryption_function("name")
encryptPhone = encriptLake.get_encryption_function("phone")

encryptAddress = encriptLake.get_encryption_function("address")
encryptDate = encriptLake.get_encryption_function("date")
encryptPersonNumber = encriptLake.get_encryption_function("person_number")
encryptGeoCode = encriptLake.get_encryption_function("geocode")

In [None]:
# S3 bucket names, one per storage layer used by this pipeline.
bucketStage = "serasaexperian-ecs-datalakehouse-prd-stage"
bucketBronze = "serasaexperian-ecs-datalakehouse-prd-bronze"
bucketSilver = "serasaexperian-ecs-datalakehouse-prd-silver"
bucketCheckpoint = "serasaexperian-ecs-datalakehouse-prd-checkpoints"

# Data locations. Note that the bronze path uses the s3a:// scheme while the
# others use s3://.
silverDMS = "s3://serasaexperian-ecs-datalakehouse-prd-silver/ecs/ecred/simulations_new/"
bronzePath = f"s3a://{bucketBronze}/ecs/events/ecred/"
silverPath = f"s3://{bucketSilver}/ecs/ecred/events_new_simulation/"
stagePath = f"s3://{bucketStage}/ecred/events_simulation/"

# Checkpoint locations, all under the checkpoint bucket.
stageCheckpoint = f"s3://{bucketCheckpoint}/ecs/ecred/events_new_simulation/stage/"
silverCheckpoint = f"s3://{bucketCheckpoint}/ecs/ecred/events_new_simulation/silver/"
silverCheckpointInc = f"s3://{bucketCheckpoint}/ecs/ecred/events_new_simulation_inc/silver/"

# Target table and stream names.
tableName = "db_ecred_silver.tb_new_events_simulation"
namedStageStream = "new_simulation_stream_write_bronze_to_stage"
namedSilverStream = "new_simulation_stream_write_bronze_to_silver"
namedSilverIniStream = "new_simulation_stream_write_ini_bronze_to_silver"

# Echo the resolved paths so they show up in the cell output.
for resolved_path in (stagePath, bronzePath, silverPath):
    print(resolved_path)

#### Getting the initial run as a pipeline parameter instead

In [None]:
# Spark configuration values come back as strings, so the flag has to be parsed:
# a plain truthiness check treats "false" (or any other non-empty string) as
# True, which would take the destructive branch below on every run.
initial_run = str(spark.conf.get("pipeline.initial_run")).strip().lower() in ("true", "1", "yes")
if initial_run:
  # Initial load: fixed cut-off dates and a clean silver location.
  date_ = '2023-10-01'
  date_load = '2023-09-30'
  try:
    # Second argument True = recursive delete of everything under silverPath.
    dbutils.fs.rm(silverPath, True)
  except Exception as e:
    # Best effort: report the failure and carry on with the run.
    print(e)
else:
  # Incremental run: dates are relative to today. Read the clock once so both
  # values are derived from the same day even if the cell runs across midnight.
  today = date.today()
  date_ = (today - timedelta(days = 2)).strftime("%Y-%m-%d")
  date_load = (today - timedelta(days = 3)).strftime("%Y-%m-%d")
date_

#### Object Schema definition

In [None]:
# Schema for an array of offer structs. Every scalar attribute is typed as a
# string; only partnerIntegrationData, benefits and productBenefits are nested.
# Field order is significant and is kept exactly as declared.
offers_schema = ArrayType(StructType(
    [StructField(field_name, StringType()) for field_name in (
        "partnerProposalId",
        "creditTypeKey",
        "totalEffectiveCostPercentMonthly",
        "totalEffectiveCostPercentAnnualy",
        "taxRatePercentMonthly",
        "taxRatePercentAnnualy",
        "taxCreditOperationPercent",
        "flag",
        "detail",
        "creditTypeKeyV2",
        "feeAnnually",
        "feeAnnuallyDescription",
        "feeAnnuallyAdditionalCard",
        "feeAnnuallyDescriptionAdditionalCard",
        "feeCreditOpening",
        "installments",
        "installmentsValue",
        "preApproved",
        "approvalRate",
        "coldOffer",
        "totalPayable",
        "value",
        "differential",
        "variant",
        "limit",
        "description",
    )]
    + [
        StructField("partnerIntegrationData", ArrayType(StructType(
            [StructField(field_name, StringType()) for field_name in (
                "redirectUrl",
                "productCode",
                "secureCode",
            )]
        ))),
        StructField("benefits", ArrayType(StringType())),
        StructField("productBenefits", ArrayType(StructType(
            [StructField(field_name, StringType()) for field_name in (
                "id",
                "categoryName",
                "categoryId",
                "description",
            )]
        ))),
    ]
    + [StructField(field_name, StringType()) for field_name in (
        "partnerKey",
        "partnerId",
        "id",
        "codeFgts",
        "detailFgts",
        "type",
        "productId",
        "offerType",
        "totalLimit",
        "digitalAccount",
        "monthlyFeeDescription",
        "monthlyFee",
        "creditCardLastFourDigits",
        "withdrawalMaxValue",
        "withdrawalMinValue",
        "firstInstallmentValue",
        "productKey",
        "partnerImportance",
    )]
))

# Schema for an array of product structs. All attributes are strings except
# CAIP, which is a nested struct of three string fields. Field order is kept
# exactly as declared.
products_schema = ArrayType(StructType(
    [StructField(field_name, StringType()) for field_name in (
        "id",
        "description",
        "businessValue",
        "business_value",
        "business_name",
        "partner_key",
        "partner_id",
        "partnerKey",
        "creditTypeKeyV2",
        "partnerId",
        "product_key",
        "importance",
        "offer_returned",
        "total_process_time",
        "reason_for_not_offering",
        "integrator_call_time",
        "integrator_status_code",
        "custom_scores_calls",
    )]
    + [
        StructField("CAIP", StructType(
            [StructField(field_name, StringType()) for field_name in (
                "call_time",
                "success",
                "status_code",
            )]
        )),
        StructField("called", StringType()),
    ]
))

# Flat struct of string fields describing filter results. Field order is kept
# exactly as declared.
ls_filters_schema = StructType(
    [StructField(field_name, StringType()) for field_name in (
        "creditTypeKeyV2",
        "eligible",
        "errorMessage",
        "filterResultAddressCity",
        "filterResultAddressState",
        "filterResultAge",
        "filterResultIncome",
        "filterResultLocation",
        "filterResultNegativated",
        "filterResultOccupation",
        "filterResultPreElegible",
        "filterResultProfession",
        "filterResultScore",
        "filterResultScoreECARD",
        "filterResultScoreHRP9",
        "filterResultScoreHSPN",
        "filterResultScoreHVAR",
        "filterResultScoreRCEP",
        "filtersAppliedBy",
        "nogordPolicyUsed",
        "nogordRequestUrl",
        "nogordResponseStatusCode",
        "nogordResponseStatusMessage",
        "nogordResponseTime",
        "offerHallGatherId",
        "offerHallGatherKey",
        "offerHallGatherTitle",
        "partnerId",
        "partnerKey",
        "productId",
        "productKey",
        "profileId",
    )]
)



In [None]:
# Flat schema for the event `data` payload, built by chaining StructType.add().
# Almost every field is a string; the exceptions are `installments` and
# `prioritizerFlowFlag` (IntegerType), `value` (DoubleType) and the two nested
# arrays of structs, `offers` and `products`.
# The nested `offers` and `products` definitions are NOT identical to the
# standalone offers_schema / products_schema declared earlier in this notebook:
# the `offers` fields appear in a different order (e.g. creditTypeKeyV2, type/id,
# offerType/codeFgts) and `products` has no creditTypeKeyV2 field here.
# NOTE(review): confirm whether those differences are intentional before
# consolidating the definitions.
dataSchema   = StructType().add("id", StringType()) \
                           .add("requestId", StringType()) \
                           .add("reason", StringType()) \
                           .add("installments", IntegerType()) \
                           .add("value", DoubleType()) \
                           .add("vehicleGuarantee", StringType()) \
                           .add("propertyGuarantee", StringType()) \
                           .add("prioritizerFlowFlag", IntegerType()) \
                           .add("clusterLabel", StringType())\
                           .add("clusterPrefix", StringType())\
                           .add("clusterRating", StringType())\
                           .add("clusterMacroRating", StringType())\
                           .add("addressZipCode", StringType())\
                           .add("occupationQuestionId", StringType())\
                           .add("assignmentId", StringType()) \
                           .add("adjusted", StringType()) \
                           .add("statusCode", StringType()) \
                           .add("success", StringType()) \
                           .add("totalProcessTime", StringType()) \
                           .add("consumerAddressZipCode", StringType()) \
                           .add("consumerId", StringType()) \
                           .add("consumerAge", StringType()) \
                           .add("consumerBirthDate", StringType()) \
                           .add("consumerCreatedAt", StringType()) \
                           .add("consumerScoresECARD", StringType()) \
                           .add("consumerScoresratingHSPN", StringType()) \
                           .add("flowAsync", StringType()) \
                           .add("requests", StringType()) \
                           .add("suggestedSimulation", StringType()) \
                           .add("consumerCpf", StringType()) \
                           .add("consumerName", StringType()) \
                           .add("consumerEmail", StringType()) \
                           .add("consumerEmailVerifiedAt", StringType()) \
                           .add("consumerCellphone", StringType()) \
                           .add("consumerCellphoneVerifiedAt", StringType()) \
                           .add("professionLabel", StringType()) \
                           .add("professionValue", StringType()) \
                           .add("professionSequence", StringType()) \
                           .add("occupationLabel", StringType()) \
                           .add("occupationValue", StringType()) \
                           .add("occupationSequence", StringType()) \
                           .add("professionId", StringType()) \
                           .add("professionQuestionId", StringType()) \
                           .add("addressNumber", StringType()) \
                           .add("paydayQuestionId", StringType()) \
                           .add("paydayId", StringType()) \
                           .add("paydayLabel", StringType()) \
                           .add("paydayValue", StringType()) \
                           .add("paydaySequence", StringType()) \
                           .add("birthStateId", StringType()) \
                           .add("birthStateQuestionId", StringType()) \
                           .add("birthStateLabel", StringType()) \
                           .add("birthStateValue", StringType()) \
                           .add("birthStateSequence", StringType()) \
                           .add("addressComplement", StringType()) \
                           .add("income", StringType()) \
                           .add("placeBirth", StringType()) \
                           .add("genderId", StringType()) \
                           .add("genderLabel", StringType()) \
                           .add("genderValue", StringType()) \
                           .add("genderSequence", StringType()) \
                           .add("addressNeighborhood", StringType()) \
                           .add("addressCity", StringType()) \
                           .add("maritalStatusId", StringType()) \
                           .add("maritalStatusQuestionId", StringType()) \
                           .add("maritalStatusLabel", StringType()) \
                           .add("maritalStatusValue", StringType()) \
                           .add("maritalStatusSequence", StringType()) \
                           .add("addressStateId", StringType()) \
                           .add("addressStateQuestionId", StringType()) \
                           .add("addressStateLabel", StringType()) \
                           .add("addressStateValue", StringType()) \
                           .add("addressStateSequence", StringType()) \
                           .add("rg", StringType()) \
                           .add("address", StringType()) \
                           .add("residenceTypeId", StringType()) \
                           .add("residenceTypeQuestionId", StringType()) \
                           .add("residenceTypeLabel", StringType()) \
                           .add("residenceTypeValue", StringType()) \
                           .add("residenceTypeSequence", StringType()) \
                           .add("offers", ArrayType(StructType().add("partnerProposalId", StringType())\
                                                                .add("creditTypeKey", StringType())\
                                                                .add("totalEffectiveCostPercentMonthly", StringType())\
                                                                .add("totalEffectiveCostPercentAnnualy", StringType())\
                                                                .add("taxRatePercentMonthly", StringType())\
                                                                .add("taxRatePercentAnnualy", StringType())\
                                                                .add("taxCreditOperationPercent", StringType())\
                                                                .add("flag", StringType())\
                                                                .add("detail", StringType())\
                                                                .add("feeAnnually", StringType())\
                                                                .add("feeAnnuallyDescription", StringType())\
                                                                .add("feeAnnuallyAdditionalCard", StringType())\
                                                                .add("feeAnnuallyDescriptionAdditionalCard", StringType())\
                                                                .add("feeCreditOpening", StringType())\
                                                                .add("creditTypeKeyV2", StringType()) \
                                                                .add("installments", StringType())\
                                                                .add("installmentsValue", StringType())\
                                                                .add("preApproved", StringType())\
                                                                .add("approvalRate", StringType())\
                                                                .add("coldOffer", StringType())\
                                                                .add("totalPayable", StringType())\
                                                                .add("value", StringType())\
                                                                .add("differential", StringType())\
                                                                .add("variant", StringType())\
                                                                .add("limit", StringType())\
                                                                .add("description", StringType())\
                                                                .add("partnerIntegrationData", ArrayType(StructType()
                                                                    .add("redirectUrl", StringType())\
                                                                    .add("productCode", StringType())\
                                                                    .add("secureCode", StringType())))\
                                                                .add("benefits", ArrayType(StringType()))\
                                                                .add("productBenefits", ArrayType(StructType()
                                                                    .add("id", StringType())\
                                                                    .add("categoryName", StringType())\
                                                                    .add("categoryId", StringType())\
                                                                    .add("description", StringType())))\
                                                                .add("partnerKey", StringType())\
                                                                .add("partnerId", StringType())\
                                                                .add("type", StringType())\
                                                                .add("id", StringType())\
                                                                .add("offerType", StringType())\
                                                                .add("codeFgts", StringType())\
                                                                .add("detailFgts", StringType())\
                                                                .add("productId", StringType())\
                                                                .add("totalLimit", StringType())\
                                                                .add("digitalAccount", StringType())\
                                                                .add("monthlyFeeDescription", StringType())\
                                                                .add("monthlyFee", StringType())\
                                                                .add("creditCardLastFourDigits", StringType())\
                                                                .add("withdrawalMaxValue", StringType())\
                                                                .add("withdrawalMinValue", StringType())\
                                                                .add("firstInstallmentValue", StringType())\
                                                                .add("productKey", StringType())\
                                                                .add("partnerImportance", StringType())))\
                           .add("consumerIncome", StringType()) \
                           .add("duplicated_slots", StringType()) \
                           .add("offers_importance", StringType()) \
                           .add("consumerScoresERCARD", StringType()) \
                           .add("consumerScoresERCRA", StringType()) \
                           .add("consumerScoresHRP9", StringType()) \
                           .add("consumerScoresHSPA", StringType()) \
                           .add("consumerScoresHSPN", StringType()) \
                           .add("consumerScoresRCEP", StringType()) \
                           .add("consumerRatingECARD", StringType()) \
                           .add("consumerRatingERCRA", StringType()) \
                           .add("consumerRatingRCEP", StringType()) \
                           .add("consumerRatingHSPN", StringType()) \
                           .add("consumerNegativated", StringType()) \
                           .add("offerHallGatherKey", StringType()) \
                           .add("products", ArrayType(StructType().add("id", StringType()) \
                                                                  .add("description", StringType()) \
                                                                  .add("businessValue", StringType()) \
                                                                  .add("business_value", StringType()) \
                                                                  .add("business_name", StringType()) \
                                                                  .add("partner_key", StringType()) \
                                                                  .add("partner_id", StringType()) \
                                                                  .add("partnerKey", StringType()) \
                                                                  .add("partnerId", StringType()) \
                                                                  .add("product_key", StringType()) \
                                                                  .add("importance", StringType()) \
                                                                  .add("offer_returned", StringType()) \
                                                                  .add("total_process_time", StringType()) \
                                                                  .add("reason_for_not_offering", StringType()) \
                                                                  .add("integrator_call_time", StringType()) \
                                                                  .add("integrator_status_code", StringType()) \
                                                                  .add("custom_scores_calls", StringType()) \
                                                                  .add("CAIP", StructType().add("call_time", StringType()) \
                                                                                           .add("success", StringType()) \
                                                                                           .add("status_code", StringType())) \
                                                                  .add("called", StringType()))) \
                           .add("failedActionType", StringType()) \
                           .add("failedActionMessage", StringType()) \
                           .add("failedActionApplication", StringType()) \
                           .add("creditTypeKeyV2", StringType()) \
                           .add("productId", StringType()) \
                           .add("failedActionStatuscode", StringType())\
                           .add("eligible", StringType())\
                           .add("errorMessage", StringType())\
                           .add("filterResultAddressCity", StringType())\
                           .add("filterResultAddressState", StringType())\
                           .add("filterResultAge", StringType())\
                           .add("filterResultIncome", StringType())\
                           .add("filterResultLocation", StringType())\
                           .add("filterResultNegativated", StringType())\
                           .add("filterResultOccupation", StringType())\
                           .add("filterResultPreElegible", StringType())\
                           .add("filterResultProfession", StringType())\
                           .add("filterResultScore", StringType())\
                           .add("filterResultScoreECARD", StringType())\
                           .add("filterResultScoreHRP9", StringType())\
                           .add("filterResultScoreHSPN", StringType())\
                           .add("filterResultScoreHVAR", StringType())\
                           .add("filterResultScoreRCEP", StringType())\
                           .add("filtersAppliedBy", StringType())\
                           .add("nogordPolicyUsed", StringType())\
                           .add("nogordRequestUrl", StringType())\
                           .add("nogordResponseStatusCode", StringType())\
                           .add("nogordResponseStatusMessage", StringType())\
                           .add("nogordResponseTime", StringType())\
                           .add("offerHallGatherId", StringType())\
                           .add("offerHallGatherTitle", StringType())\
                           .add("profileId", StringType())

In [None]:
# Spark settings passed as `spark_conf` to the DLT views and tables declared in
# this notebook. Every key listed here is set to the boolean True.
spark_conf = dict.fromkeys(
    (
        "spark.databricks.delta.properties.defaults.autoOptimize.optimizeWrite",
        "spark.databricks.delta.properties.defaults.autoOptimize.autoCompact",
        "spark.databricks.delta.optimizeWrite.enabled",
        "spark.databricks.adaptive.autoOptimizeShuffle.enabled",
        "spark.databricks.adaptive.skewJoin.spillProof.enabled",
        "spark.sql.adaptive.skewJoin.enabled",
        "spark.databricks.delta.cache.enabled",
        "spark.databricks.io.cache.enabled",
    ),
    True,
)


#### DLT METHODS

In [None]:
from dataclasses import asdict, dataclass, field
from typing import Any, Callable, Dict, List, Union, Optional, Dict, Literal
import dlt

QUARANTINE_COL: str = "is_quarantined"

def can_quarantine(is_quarantined: bool, expect_all: Dict[str, str], has_scd: bool = False) -> bool:
    """
    Decide whether quarantine handling applies to a table.

    Quarantine applies only when it was requested, when there is at least one
    `expect_all` rule to evaluate, and when the table is not an SCD target.

    The previous implementation called an undefined `has_scd(entity)` helper,
    which raised NameError as soon as quarantine was requested with a non-empty
    rule set. The SCD flag is now an optional argument, so existing two-argument
    callers keep working.

    Args:
        is_quarantined: Whether quarantine was requested for the table.
        expect_all: Expectation name -> SQL condition; may be empty or None.
        has_scd: Whether the table is populated through SCD / apply_changes.

    Returns:
        True when quarantine handling should be set up, False otherwise.
    """
    return bool(is_quarantined) and bool(expect_all) and not has_scd


def quarantine_rules(is_quarantined: bool, expect_all: Dict[str, str]) -> str:
    """
    Build the SQL predicate that flags a row as quarantined.

    The predicate is the negation of all `expect_all` conditions combined with
    AND. Each condition is wrapped in parentheses before joining: without them a
    condition containing OR (e.g. "a > 0 OR b > 0") would be re-associated by
    SQL operator precedence when combined with the other conditions.

    Args:
        is_quarantined: Whether quarantine was requested for the table.
        expect_all: Expectation name -> SQL condition.

    Returns:
        "NOT((c1) AND (c2) ...)" when quarantine applies, otherwise the
        always-false predicate "1=0".
    """
    if not can_quarantine(is_quarantined, expect_all):
        return "1=0"
    conjunction: str = " AND ".join(f"({condition})" for condition in expect_all.values())
    return "NOT({0})".format(conjunction)

In [None]:
def generate_view(
  input_df: DataFrame,
  destination: str,
  comment: str,
  expect_all: Dict[str, str],
  expect_all_or_drop: Dict[str, str],
  expect_all_or_fail: Dict[str, str],
) -> None:
  """
  Register `input_df` as a DLT view.

  Args:
    input_df: DataFrame returned by the view's query function.
    destination: Name given to the view.
    comment: Comment attached to the view.
    expect_all: Expectations passed to `dlt.expect_all`.
    expect_all_or_drop: Expectations passed to `dlt.expect_all_or_drop`.
    expect_all_or_fail: Expectations passed to `dlt.expect_all_or_fail`.

  The view is declared with the notebook-level `spark_conf`.
  """
  # Build the decorators in the same top-to-bottom order they are applied in.
  as_view = dlt.view(name=destination, comment=comment, spark_conf=spark_conf)
  with_expectations = dlt.expect_all(expectations=expect_all)
  with_drop_expectations = dlt.expect_all_or_drop(expectations=expect_all_or_drop)
  with_fail_expectations = dlt.expect_all_or_fail(expectations=expect_all_or_fail)

  @as_view
  @with_expectations
  @with_drop_expectations
  @with_fail_expectations
  def _():
    return input_df


In [None]:
def generate_table(input_df: DataFrame, destination: str, partition_cols: List[str], comment: str, expect_all: Dict[str, str], expect_all_or_drop: Dict[str, str], expect_all_or_fail: Dict[str, str], is_quarantined: bool, has_scd: bool, src: str, keys: List[str], sequence_by: str, ignore_null_updates: bool, apply_as_deletes: str) -> None:
    """
    Declare the DLT table(s) for `destination`.

    Depending on the flags this declares either a plain table fed by `input_df`
    or a streaming table fed through `apply_changes` from `src`. For plain
    tables with quarantine enabled, `<destination>_quarantine` is declared as
    the flagged intermediate table and `destination` / `<destination>_invalid`
    are derived from it.

    Args:
        input_df: DataFrame used as the table query (plain tables only).
        destination: Name of the final table.
        partition_cols: Partition columns of the final table.
        comment: Table comment.
        expect_all: Expectations that only record violations; also the rules
            used to compute the quarantine flag.
        expect_all_or_drop: Expectations passed on as drop rules.
        expect_all_or_fail: Expectations passed on as fail rules.
        is_quarantined: Whether quarantine was requested.
        has_scd: Whether the table is populated through `apply_changes`.
        src: Source name for `apply_changes` (SCD only).
        keys: Key columns for `apply_changes` (SCD only).
        sequence_by: Ordering column for `apply_changes` (SCD only).
        ignore_null_updates: Forwarded to `apply_changes` (SCD only).
        apply_as_deletes: Delete condition for `apply_changes` (SCD only).
    """
    #logger.info(f"Generating table for entity: ")
    name: str = destination
    quarantine_name: str = f"{name}_quarantine"
    invalid_name: str = f"{name}_invalid"
    # SCD targets are never quarantined: the quarantine flag column is only added
    # by _create_table, so the valid/invalid tables would filter on a column the
    # apply_changes target does not have.
    if not has_scd and can_quarantine(is_quarantined, expect_all):
        _create_quarantine_tables(
            valid_name=name,
            invalid_name=invalid_name,
            quarantine_name=quarantine_name,
            partition_cols = partition_cols)
        # From here on the table being built is the intermediate quarantine
        # table, partitioned by the flag first.
        name = quarantine_name
        partition_cols = [QUARANTINE_COL] + list(partition_cols or [])
    if has_scd:
        # Forward the caller's ignore_null_updates; it used to be hard-coded to
        # False, silently discarding the argument.
        _create_scd_table(source=src, keys=keys, sequence_by=sequence_by, ignore_null_updates=ignore_null_updates, apply_as_deletes=apply_as_deletes, name=name, partition_cols=partition_cols, comment=comment, expect_all=expect_all, expect_all_or_drop=expect_all_or_drop, expect_all_or_fail=expect_all_or_fail)
    else:
        _create_table(input_df, name, partition_cols, comment, expect_all, expect_all_or_drop, expect_all_or_fail, is_quarantined)


In [None]:
def _create_quarantine_tables(
    valid_name: str,
    invalid_name: str,
    quarantine_name: str,
    partition_cols: List[str],
):
  """
  Declare the two DLT tables that split the quarantine table by its flag.

  Both tables stream from `quarantine_name` and share `partition_cols`:
  `valid_name` keeps the rows whose QUARANTINE_COL is false, `invalid_name`
  keeps the rows whose QUARANTINE_COL is true.
  """
  @dlt.table(name=valid_name, partition_cols=partition_cols)
  def valid_data():
    # Rows that passed: the flag and the "_rescued_data" column are dropped.
    flagged_rows: DataFrame = dlt.readStream(quarantine_name)
    passed_rows = flagged_rows.filter(f"{QUARANTINE_COL}=false")
    return passed_rows.drop(QUARANTINE_COL, "_rescued_data")

  @dlt.table(name=invalid_name, partition_cols=partition_cols)
  def invalid_data():
    # Rows that failed: only the flag column is dropped.
    flagged_rows: DataFrame = dlt.readStream(quarantine_name)
    failed_rows = flagged_rows.filter(f"{QUARANTINE_COL}=true")
    return failed_rows.drop(QUARANTINE_COL)

In [None]:
def _create_table(input_df: DataFrame, name: str, partition_cols: List[str], comment: str, expect_all: Dict, expect_all_or_drop: Dict, expect_all_or_fail: Dict, is_quarantined: bool):
  """
  Declare a DLT table called `name` whose query returns `input_df`.

  When quarantine applies (see `can_quarantine`) the table is declared as
  temporary and gets an extra QUARANTINE_COL column computed from the
  `expect_all` rules; otherwise `input_df` is returned unchanged.

  Args:
    input_df: DataFrame returned by the table query.
    name: Table name.
    partition_cols: Partition columns.
    comment: Table comment.
    expect_all: Expectations passed to `dlt.expect_all`; also the source of the
      quarantine rules.
    expect_all_or_drop: Expectations passed to `dlt.expect_all_or_drop`.
    expect_all_or_fail: Expectations passed to `dlt.expect_all_or_fail`.
    is_quarantined: Whether quarantine was requested.
  """
  #logger.debug(f"Creating table: {name}")
  # The table is only an intermediate one when quarantine really applies. Using
  # the raw `is_quarantined` flag marked the final table as temporary whenever
  # quarantine was requested but no expect_all rules were supplied.
  quarantine_enabled: bool = can_quarantine(is_quarantined, expect_all)
  @dlt.table(
    name=name,
    comment=comment,
    partition_cols=partition_cols,
    spark_conf=spark_conf,
    temporary=quarantine_enabled,
  )
  @dlt.expect_all(expectations=expect_all)
  @dlt.expect_all_or_drop(expectations=expect_all_or_drop)
  @dlt.expect_all_or_fail(expectations=expect_all_or_fail)
  def target_table():
    if not quarantine_enabled:
      return input_df
    # Flag each row with the negated expectation predicate.
    rules: str = quarantine_rules(is_quarantined, expect_all)
    return input_df.withColumn(QUARANTINE_COL, F.expr(rules))

In [None]:
def _create_scd_table(source:str, keys: List[str], sequence_by: str, ignore_null_updates: bool, apply_as_deletes: str, name: str, partition_cols: List[str], comment: str, expect_all: Dict, expect_all_or_drop: Dict, expect_all_or_fail: Dict
):
  #logger.debug(f"Creating SCD table: {name}")
  dlt.create_streaming_table(
      name=name,
      comment=comment,
      partition_cols=partition_cols,
      spark_conf=spark_conf,
      expect_all=expect_all,
      expect_all_or_drop=expect_all_or_drop,
      expect_all_or_fail=expect_all_or_fail,
  )
  dlt.apply_changes(
      target=name,
      source=source,
      keys=keys,
      sequence_by=entity.apply_changes.sequence_by,
      ignore_null_updates=ignore_null_updates,
      apply_as_deletes=apply_as_deletes
  )

In [None]:
# Historical rows from the DMS silver table: everything created before `date_`
# (the bronze event read in this notebook takes rows from `date_` onwards).
# Columns that the DMS table does not provide are added as typed NULLs.
df_dms = (
    spark.read.load(silverDMS)
    .where(f"cast(dt_created_at as date) < '{date_}'")
    .selectExpr(
        "cd_simulation",
        "dt_created_at",
        "vehicle_guarantee",
        "property_guarantee",
        "vl_simulation",
        "qt_installments",
        "adjusted",
        "prioritizer_flow_flag",
        "ds_client",
        "cd_uuid",
        "consumer_birth_date",
        "consumer_cellphone",
        "consumer_cellphone_verified_at",
        "consumer_cpf",
        "consumer_created_at",
        "consumer_data_address_zip_code",
        "consumer_data_consumer_id",
        "consumer_email",
        "consumer_email_verified_at",
        "consumer_name",
        "consumer_user_data_created_at",
        "consumer_scores_ERCRA",
        "consumer_presumedIncome",
        "consumer_age",
        "consumer_assignment_cluster_name",
        "consumer_assignment_cluster_prefix",
        "consumer_assignment_cluster_rating",
        "consumer_assignment_cluster_macro_rating",
        "consumer_assignment_id",
        "questions_address_zip_code",
        "questions_occupation_questionId",
        "questions_occupation_label",
        "questions_occupation_value",
        "questions_occupation_sequence",
        "questions_address",
        "questions_profession_id",
        "questions_profession_questionId",
        "questions_profession_label",
        "questions_profession_value",
        "questions_profession_sequence",
        "questions_address_number",
        "questions_payday_id",
        "questions_payday_questionId",
        "questions_payday_label",
        "questions_payday_value",
        "questions_payday_sequence",
        "questions_birth_state_id",
        "questions_birth_state_questionId",
        "questions_birth_state_label",
        "questions_birth_state_value",
        "questions_birth_state_sequence",
        "questions_address_complement",
        "questions_income",
        "questions_place_birth",
        "questions_gender_questionId",
        "questions_gender_label",
        "questions_gender_value",
        "questions_gender_sequence",
        "questions_address_neighborhood",
        "questions_address_city",
        "questions_marital_status_id",
        "questions_marital_status_questionId",
        "questions_marital_status_label",
        "questions_marital_status_value",
        "questions_marital_status_sequence",
        "questions_address_state_id",
        "questions_address_state_questionId",
        "questions_address_state_label",
        "questions_address_state_value",
        "questions_address_state_sequence",
        "questions_rg",
        "questions_residence_type_id",
        "questions_residence_type_questionId",
        "questions_residence_type_label",
        "questions_residence_type_value",
        "questions_residence_type_sequence",
        "reason_key",
        "ls_partners",
        "ls_offers",
        "ls_products",
        "dt_load_bronze",
        "cast(dt_created_at as date) as dt_event",
        "cast(current_timestamp() as date) as dt_load_silver",
        "file_name_raw",
        "cast(null as string) as consumer_scores_ECARD",
        "cast(null as string) as consumer_scores_HRP9",
        "cast(null as string) as consumer_scores_HSPA",
        "cast(null as string) as consumer_scores_HSPN",
        "cast(null as string) as consumer_scores_RCEP",
        "cast(null as string) as consumer_scores_rating_HSPN",
        "cast(null as string) as consumer_rating_RCEP",
        "cast(null as string) as consumer_rating_HSPN",
        "cast(null as string) as consumer_rating_ECARD",
        "cast(null as string) as consumer_rating_ERCRA",
        "cast(null as string) as duplicated_slots",
        "cast(null as string) as offers_importance",
        "cast(null as string) as consumerNegativated",
        "cast(null as string) as offerHallGatherKey",
        "cast(null as timestamp) as ts_event",
        "cast(null as string) as creditTypeKeyV2",
    )
    # The schemas are Python objects, so they must be passed through the Column
    # API: inside a SQL string `offers_schema` would be resolved as a column name.
    # withColumn replaces ls_offers / ls_products in place, which also avoids
    # emitting each of them twice under the same name.
    .withColumn("ls_offers", F.from_json(F.to_json("ls_offers"), offers_schema))
    .withColumn("ls_products", F.from_json(F.to_json("ls_products"), products_schema))
    .withColumn("ls_offers_events", F.col("ls_offers"))
    .withColumn("ls_products_events", F.col("ls_products"))
    # Typed NULL: when(False, x) yields NULL with x's type, i.e. an array of
    # whatever from_json produces for ls_filters_schema.
    # NOTE(review): the original SQL spelled the type as
    # array<struct<ls_filters_schema>>, which is not parseable - confirm the
    # intended element type.
    .withColumn(
        "ls_filters_events",
        F.when(
            F.lit(False),
            F.array(F.from_json(F.lit(None).cast("string"), ls_filters_schema)),
        ),
    )
)

generate_table(
    input_df=df_dms,
    destination="silver_dms_tbl",
    comment="PUT SOME RELEVANT COMMENT ABOUT THIS DATA OBJECT HERE",
    is_quarantined=False,
    has_scd=False,
)

In [None]:
# Bronze events for the four simulation event codes, restricted to loads from
# `date_load` onwards and to events whose date (dateTime minus 3 hours) is on or
# after `date_`. The JSON payload in `data` is parsed into `new_data`.
df_bronze = (
    spark.read.load(bronzePath)
    .filter(col("dt_load") >= date_load)
    .where(f"eventCode in ('2600','2601','2602','311') and cast((dateTime - INTERVAL '3' HOUR) as date) >= '{date_}'")
    .select(
        "simulationId",
        "clientOrigin",
        "dateTime",
        "userId",
        "eventCode",
        "dt_load_bronze",
        "dt_load",
        "file_name_raw",
        "data",
        from_json("data", dataSchema).alias("new_data"),
    )
    # Event timestamp shifted by the same 3-hour offset used in the filter above.
    .withColumn("ts_event", col("dateTime") - expr('INTERVAL 3 HOURS'))
)

generate_view(
    input_df=df_bronze,
    destination="bronze_data_vw",
    comment="PUT SOME RELEVANT COMMENT HERE",
)

In [None]:
def _latest_events(event_predicate, *dedup_keys):
    """Read ``bronze_data_vw`` and keep the newest row per ``dedup_keys``.

    Args:
        event_predicate: SQL filter applied to the view.
        *dedup_keys: columns defining one group; within each group only the row
            with the greatest ``dateTime`` is kept (``row_number == 1``).

    Returns:
        The filtered DataFrame, still carrying the helper ``row_number`` column.
    """
    # NOTE(review): this ranks rows with a non-time window on a readStream
    # source - confirm the runtime accepts that for this view.
    matching = dlt.readStream("bronze_data_vw").where(event_predicate)
    newest_first = Window.partitionBy(*dedup_keys).orderBy(col("dateTime").desc())
    ranked = matching.withColumn("row_number", row_number().over(newest_first))
    return ranked.filter(col("row_number") == 1)


# One deduplicated frame per event code.
df_2600 = _latest_events(
    "eventCode == '2600' and new_data.success == 'true' and new_data.consumerCpf is not null",
    col("simulationId"),
)

df_2601 = _latest_events("eventCode == '2601'", col("simulationId"))

df_2602 = _latest_events("eventCode == '2602'", col("simulationId"))

# 311 events are deduplicated per (simulation, product) and reduced to the
# columns needed downstream; `data` is re-parsed with ls_filters_schema.
df_311 = _latest_events(
    "eventCode == '311'",
    col("simulationId"),
    col("new_data.productId"),
).select(
    "simulationId",
    "new_data.productId",
    "new_data.creditTypeKeyV2",
    from_json("data", ls_filters_schema).alias("ls_filters_events"),
)

In [None]:
# One row per product of each 2601 event (explode_outer keeps events without products).
df_explode = df_2601.select("*", explode_outer("new_data.products").alias("products_explode"))

products_fields = df_explode.schema["products_explode"].dataType.names

# True only when creditTypeKeyV2 and every field of the exploded product are NULL.
product_is_empty = col("creditTypeKeyV2").isNull()
for product_field in products_fields:
    product_is_empty = product_is_empty & col(f"products_explode.{product_field}").isNull()

# Match each exploded product with the 311 event of the same simulation and product.
same_simulation_and_product = (
    (df_311.simulationId == df_explode.simulationId)
    & (df_311.productId == df_explode.products_explode.id)
)

# The original product struct extended with the creditTypeKeyV2 taken from the 311 event.
product_with_credit_type = struct(
    *[col("products_explode")[field_name].alias(field_name) for field_name in products_fields],
    col("creditTypeKeyV2").alias("creditTypeKeyV2"),
)

df_struct_rebuild = (
    df_explode
    .join(df_311, [same_simulation_and_product], "left")
    .withColumn(
        "products_explode_rebuild",
        when(product_is_empty, lit(None)).otherwise(product_with_credit_type),
    )
    .select(df_explode.simulationId.alias("simulationId_"), "products_explode_rebuild")
)

# Collapse back to one row per simulation with the distinct rebuilt products.
df_rebuild = (
    df_struct_rebuild
    .groupBy(col("simulationId_"))
    .agg(collect_set(col("products_explode_rebuild")).alias("ls_products"))
)

generate_view(
    input_df=df_rebuild,
    destination="rebuild_df",
    comment="PUT SOME RELEVANT COMMENT HERE",
)

In [None]:
# Projection of the 2600 event: (source column, output column, re-encryption function).
# Where a function is given, the value is decrypted with ecredDecrypt and then
# passed to that function; otherwise the column is only renamed.
_EVENT_2600_PROJECTION = [
    ("simulationId", "cd_simulation", None),
    ("ts_event", "dt_created_at", None),
    ("new_data.vehicleGuarantee", "vehicle_guarantee", None),
    ("new_data.propertyGuarantee", "property_guarantee", None),
    ("new_data.value", "vl_simulation", None),
    ("new_data.installments", "qt_installments", None),
    ("new_data.adjusted", "adjusted", None),
    ("new_data.prioritizerFlowFlag", "prioritizer_flow_flag", None),
    ("clientOrigin", "ds_client", None),
    ("userId", "cd_uuid", None),
    ("new_data.consumerBirthDate", "consumer_birth_date", encryptDate),
    ("new_data.consumerCellphone", "consumer_cellphone", encryptPhone),
    ("new_data.consumerCellphoneVerifiedAt", "consumer_cellphone_verified_at", None),
    ("new_data.consumerCpf", "consumer_cpf", encryptCPF),
    ("new_data.consumerCreatedAt", "consumer_created_at", None),
    ("new_data.consumerAddressZipCode", "consumer_data_address_zip_code", encryptGeoCode),
    ("new_data.consumerId", "consumer_data_consumer_id", None),
    ("new_data.consumerEmail", "consumer_email", encryptEmail),
    ("new_data.consumerEmailVerifiedAt", "consumer_email_verified_at", None),
    ("new_data.consumerName", "consumer_name", encryptName),
    # Same source field as consumer_created_at.
    ("new_data.consumerCreatedAt", "consumer_user_data_created_at", None),
    ("new_data.consumerScoresERCRA", "consumer_scores_ERCRA", None),
    ("new_data.consumerIncome", "consumer_presumedIncome", None),
    ("new_data.consumerAge", "consumer_age", None),
    ("new_data.clusterLabel", "consumer_assignment_cluster_name", None),
    ("new_data.clusterPrefix", "consumer_assignment_cluster_prefix", None),
    ("new_data.clusterRating", "consumer_assignment_cluster_rating", None),
    ("new_data.clusterMacroRating", "consumer_assignment_cluster_macro_rating", None),
    ("new_data.assignmentId", "consumer_assignment_id", None),
    ("new_data.reason", "reason_key", None),
    ("new_data.consumerScoresECARD", "consumer_scores_ECARD", None),
    ("new_data.consumerScoresHRP9", "consumer_scores_HRP9", None),
    ("new_data.consumerScoresHSPA", "consumer_scores_HSPA", None),
    ("new_data.consumerScoresHSPN", "consumer_scores_HSPN", None),
    ("new_data.consumerScoresRCEP", "consumer_scores_RCEP", None),
    ("new_data.consumerScoresratingHSPN", "consumer_scores_rating_HSPN", None),
    ("new_data.consumerRatingECARD", "consumer_rating_ECARD", None),
    ("new_data.consumerRatingERCRA", "consumer_rating_ERCRA", None),
    ("new_data.consumerRatingRCEP", "consumer_rating_RCEP", None),
    ("new_data.consumerRatingHSPN", "consumer_rating_HSPN", None),
    ("dt_load_bronze", "dt_load_bronze", None),
    ("file_name_raw", "file_name_raw", None),
    ("new_data.consumerNegativated", "consumerNegativated", None),
    ("new_data.offerHallGatherKey", "offerHallGatherKey", None),
]

df_events_2600 = df_2600.select(*[
    (
        col(source_column)
        if reencrypt is None
        else reencrypt(ecredDecrypt(col(source_column)))
    ).alias(output_column)
    for source_column, output_column, reencrypt in _EVENT_2600_PROJECTION
])

In [None]:
# Offer-related attributes of the 2601 event; the key is renamed so it does not
# clash with cd_simulation when joined.
df_events_2601 = df_2601.selectExpr(
    "simulationId as cd_simulation_2601",
    "new_data.offers as ls_offers",
    "new_data.duplicated_slots as duplicated_slots",
    "new_data.offers_importance as offers_importance",
    "new_data.creditTypeKeyV2 as creditTypeKeyV2",
)

In [None]:
# Fields of `new_data` in the 2602 event that are decrypted with ecredDecrypt and
# re-encrypted with the mapped function before being written.
_EVENT_2602_REENCRYPT = {
    "addressZipCode": encryptGeoCode,
    "address": encryptAddress,
    "addressComplement": encryptAddress,
    # NOTE(review): RG goes through the CPF encryption function - confirm this is intended.
    "rg": encryptCPF,
}

# (field of `new_data`, output column), in output order.
_EVENT_2602_FIELDS = [
    ("addressZipCode", "questions_address_zip_code"),
    ("occupationQuestionId", "questions_occupation_questionId"),
    ("occupationLabel", "questions_occupation_label"),
    ("occupationValue", "questions_occupation_value"),
    ("occupationSequence", "questions_occupation_sequence"),
    ("address", "questions_address"),
    ("professionId", "questions_profession_id"),
    ("professionQuestionId", "questions_profession_questionId"),
    ("professionLabel", "questions_profession_label"),
    ("professionValue", "questions_profession_value"),
    ("professionSequence", "questions_profession_sequence"),
    ("addressNumber", "questions_address_number"),
    ("paydayId", "questions_payday_id"),
    ("paydayQuestionId", "questions_payday_questionId"),
    ("paydayLabel", "questions_payday_label"),
    ("paydayValue", "questions_payday_value"),
    ("paydaySequence", "questions_payday_sequence"),
    ("birthStateId", "questions_birth_state_id"),
    ("birthStateQuestionId", "questions_birth_state_questionId"),
    ("birthStateLabel", "questions_birth_state_label"),
    ("birthStateValue", "questions_birth_state_value"),
    ("birthStateSequence", "questions_birth_state_sequence"),
    ("addressComplement", "questions_address_complement"),
    ("income", "questions_income"),
    ("placeBirth", "questions_place_birth"),
    # NOTE(review): reads genderId, not genderQuestionId - confirm against the event schema.
    ("genderId", "questions_gender_questionId"),
    ("genderLabel", "questions_gender_label"),
    ("genderValue", "questions_gender_value"),
    ("genderSequence", "questions_gender_sequence"),
    ("addressNeighborhood", "questions_address_neighborhood"),
    ("addressCity", "questions_address_city"),
    ("maritalStatusId", "questions_marital_status_id"),
    ("maritalStatusQuestionId", "questions_marital_status_questionId"),
    ("maritalStatusLabel", "questions_marital_status_label"),
    ("maritalStatusValue", "questions_marital_status_value"),
    ("maritalStatusSequence", "questions_marital_status_sequence"),
    ("addressStateId", "questions_address_state_id"),
    ("addressStateQuestionId", "questions_address_state_questionId"),
    ("addressStateLabel", "questions_address_state_label"),
    ("addressStateValue", "questions_address_state_value"),
    ("addressStateSequence", "questions_address_state_sequence"),
    ("rg", "questions_rg"),
    ("residenceTypeId", "questions_residence_type_id"),
    ("residenceTypeQuestionId", "questions_residence_type_questionId"),
    ("residenceTypeLabel", "questions_residence_type_label"),
    ("residenceTypeValue", "questions_residence_type_value"),
    ("residenceTypeSequence", "questions_residence_type_sequence"),
]


def _event_2602_column(field_name, output_column):
    """Build the output column for one ``new_data`` field of the 2602 event."""
    value = col(f"new_data.{field_name}")
    reencrypt = _EVENT_2602_REENCRYPT.get(field_name)
    if reencrypt is not None:
        value = reencrypt(ecredDecrypt(value))
    return value.alias(output_column)


df_events_2602 = df_2602.select(
    col("simulationId").alias("cd_simulation_2602"),
    *[_event_2602_column(field_name, output_column) for field_name, output_column in _EVENT_2602_FIELDS],
)

In [None]:
# All parsed 311 payloads of a simulation gathered into one list.
df_events_311 = (
    df_311
    .select(col("simulationId").alias("cd_simulation_311"), col("ls_filters_events"))
    .groupBy("cd_simulation_311")
    .agg(collect_list("ls_filters_events").alias("ls_filters_events"))
)

In [None]:
def _null_string(field_name):
    """Return a NULL string column named ``field_name`` (placeholder struct field)."""
    return lit(None).cast("string").alias(field_name)


# The 2600 event is the driving side; every other event is left-joined on the simulation id.
df_2600_2601 = df_events_2600.join(df_events_2601, df_events_2600.cd_simulation == df_events_2601.cd_simulation_2601, "left")
df_all = (
    df_2600_2601
    .join(df_events_2602, df_2600_2601.cd_simulation == df_events_2602.cd_simulation_2602, "left")
    .join(df_rebuild, df_events_2600.cd_simulation == df_rebuild.simulationId_, "left")
    # added a new join (311 filter events)
    .join(df_events_311, df_2600_2601.cd_simulation == df_events_311.cd_simulation_311, "left")
)

# ls_partners is not available from the events: it is emitted as a single-element
# array holding a struct whose leaves are all NULL strings.
_empty_partner = struct(
    _null_string("userId"),
    _null_string("name"),
    _null_string("businessName"),
    _null_string("status"),
    _null_string("foundationDate"),
    _null_string("disclaimer"),
    _null_string("authorizationDisclaimer"),
    _null_string("key"),
    array(struct(
        _null_string("productId"),
        _null_string("startDate"),
        _null_string("endDate"),
        _null_string("active"),
        _null_string("id"),
    )).alias("contracts"),
    _null_string("id"),
    array(struct(
        _null_string("key"),
        _null_string("description"),
        _null_string("value"),
        _null_string("active"),
        _null_string("type"),
        _null_string("origin"),
    )).alias("settings"),
    array(lit(None).cast("string")).alias("matches"),
    struct(
        _null_string("name"),
        _null_string("prefix"),
        _null_string("rating"),
        _null_string("macro_rating"),
    ).alias("cluster"),
    struct(
        _null_string("origin"),
        _null_string("value"),
    ).alias("approvalPropensity"),
    _null_string("businessValue"),
    _null_string("called"),
)

df_events = (
    df_all
    # Drop the renamed join keys; cd_simulation from the 2600 event remains.
    .drop("cd_simulation_2601", "cd_simulation_2602", "simulationId_", "cd_simulation_311")
    .withColumn("ls_offers_events", col("ls_offers"))
    .withColumn("ls_products_events", col("ls_products"))
    .withColumn("ls_partners", array(_empty_partner))
    .withColumn("dt_load_silver", current_timestamp().cast("date"))
    .withColumn("ts_event", col("dt_created_at"))
    .withColumn("dt_event", col("dt_created_at").cast("date"))
)

generate_table(
    input_df=df_events,
    destination="df_events_tbl",
    comment="PUT SOME RELEVANT COMMENT ABOUT THIS DATA OBJECT HERE",
    is_quarantined=True,
    # Expectations are a {name: SQL constraint} mapping. The previous literal was
    # a set containing one string, which is not a valid expectations dict.
    expect_all={"some_condition": "ls_offers_events is not null"},
)

In [None]:
# On an initial run the existing silver table is dropped before it is declared again.
if initial_run:
  spark.sql("DROP TABLE IF EXISTS db_ecred_silver.tb_new_events_simulation")

# Final target: change-applied table fed from df_events_tbl, keyed by simulation
# and creation timestamp and ordered by the creation timestamp.
generate_table(
    destination='tb_new_events_simulation',
    src='df_events_tbl',
    has_scd=True,
    is_quarantined=False,
    keys=["cd_simulation", "dt_created_at"],
    sequence_by='dt_created_at',
    comment="PUT SOME RELEVANT COMMENT ABOUT THIS DATA OBJECT HERE",
)
print("Merge OK")