In [0]:
import sys
import os

sys.path.append(os.path.abspath('..'))

from src.utilities import *

In [0]:
# --- Bronze ingestion configuration ------------------------------------------
# BASE_VOLUME_PATH is not defined in this notebook; presumably it is provided by
# the wildcard import of src.utilities -- TODO confirm.

# Inbox folders that are scanned for pending Excel files.
PATH_INBOX_EXP = f"{BASE_VOLUME_PATH}/Pendientes/Expuestos"
PATH_INBOX_CONT = f"{BASE_VOLUME_PATH}/Pendientes/Contratantes"

# Destination folders; a file is moved here after it has been appended to its table.
PATH_PROCESSED_EXP = f"{BASE_VOLUME_PATH}/Procesados/Expuestos"
PATH_PROCESSED_CONT = f"{BASE_VOLUME_PATH}/Procesados/Contratantes"

# Fully qualified target tables for each process.
TABLE_BRONZE_EXP = "bronze_dev.sctr_emision.expuestos_bronze"
TABLE_BRONZE_CONT = "bronze_dev.sctr_emision.contratantes_bronze"

# Column positions to keep from every sheet and the names they receive.
# The two lists are matched positionally, so they must have the same length.
COLS_IDX_EXP = [1,2,3,5,6,7,8,9,10,11,12,13,18,19]
COLS_NAM_EXP = ['POLIZA','F_INI_VIGEN_POLIZA','F_FIN_VIGEN_POLIZA','CERTIFICADO','F_INI_COBERT','F_FIN_COBERT',
                'P_NOMBRE','S_NOMBRE','AP_PATERNO','AP_MATERNO','TIPO_DOC','NUM_DOC','YEAR_MOV','MONTH_MOV']

# Same positional mapping for the "Contratantes" files.
COLS_IDX_CONT = [1,2,3,6,8,9]
COLS_NAM_CONT = ['TIPO_DOC','NUM_DOC_CONT','CONTRATANTE','POLIZA','YEAR_MOV','MONTH_MOV']

# Start the log for this stage. open_log / logger are not defined in this
# notebook; presumably they come from src.utilities -- TODO confirm.
open_log("Bronze")

# Turn on Arrow for pandas <-> Spark DataFrame conversion in this session.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
# NOTE(review): spark.conf.get without a default raises when the key is not set
# on the current compute; confirm the cluster tag is always available.
logger.info(f"üü† Iniciando proceso Bronze en Databricks. Cl√∫ster: {spark.conf.get('spark.databricks.clusterUsageTags.clusterId')}")

In [0]:
def get_list_files(process_name: str, path_process: str) -> list:
    """List the Excel workbooks waiting in an inbox folder.

    Args:
        process_name: Label of the running process; only used in log messages.
        path_process: Folder to list with ``dbutils.fs.ls``.

    Returns:
        The paths (``FileInfo.path``) of every entry whose name ends in
        ``.xlsx``, compared case-insensitively so uploads such as
        ``REPORTE.XLSX`` are not silently left behind. Returns ``None`` (not an
        empty list) when the folder cannot be listed or holds no Excel file;
        callers are expected to test the result for truthiness.
    """
    logger.info(f"   üöÄ Iniciando proceso {process_name}...")

    logger.info(f"   üìÇ Leyendo archivos de carpeta {path_process}...")
    try:
        lista_archivos = dbutils.fs.ls(path_process)
        # Lower-case the name before matching: the extension check used to be
        # case-sensitive and skipped ".XLSX" files without any warning.
        archivos_excel = [f.path for f in lista_archivos if f.name.lower().endswith(".xlsx")]
    except Exception as e:
        # Best effort: a missing or unreadable folder is logged, not raised.
        logger.warning(f"   ‚ö†Ô∏è No se pudo listar carpeta. {e}")
        return None

    if not archivos_excel:
        logger.info("   ‚ÑπÔ∏è No hay archivos pendientes.")
        return None

    logger.info(f"   üìÑ Se encontraron {len(archivos_excel)} archivos.")

    return archivos_excel

In [0]:
def ingest_excel_to_df_spark(file_path: str,  cols_ids: list, col_names: list) -> DataFrame:
    """Read every sheet of an Excel workbook with the Spark Excel reader.

    fastexcel is used only to enumerate the sheet names; each sheet is then
    loaded through ``com.crealytics.spark.excel`` with headers and without
    schema inference. The columns at positions ``cols_ids`` are kept, renamed
    positionally to ``col_names``, rows that are entirely null are dropped and
    every column is cast to string.

    Args:
        file_path: URI of the workbook. The ``dbfs:`` / ``file:`` prefix is
            stripped only for the fastexcel call; Spark receives the URI as is.
        cols_ids: Positions (indices into the sheet's column list) to keep.
        col_names: Output names, matched positionally with ``cols_ids``.

    Returns:
        The union of all usable sheets plus the ``NOMBRE_ARCHIVO`` and
        ``FECHA_CARGA`` columns, or ``None`` when no sheet could be used or a
        file-level error occurred (errors are logged, never raised).
    """
    dfs_sheets = []

    try:
        path_local = file_path.replace("dbfs:", "").replace("file:", "")
        inspector = fastexcel.read_excel(path_local)
        sheet_names = inspector.sheet_names

        # Loop-invariant: highest position that must exist in every sheet.
        max_idx_needed = max(cols_ids)

        for sheet in sheet_names:
            try:
                df_sheet = (spark.read.format("com.crealytics.spark.excel")
                            .option("header", "true")
                            .option("inferSchema", "false")
                            .option("dataAddress", f"'{sheet}'!")
                            .option("treatEmptyValuesAsNulls", "true")
                            .load(file_path)
                           )

                source_cols = df_sheet.columns

                if max_idx_needed >= len(source_cols):
                    logger.warning(f"    ‚ö†Ô∏è Hoja '{sheet}' ignorada: Faltan columnas. Se necesita √≠ndice {max_idx_needed}, pero hay {len(source_cols)} columnas.")
                    continue

                # Rename all selected columns at once with toDF. A chain of
                # withColumnRenamed calls is order-dependent: when a source
                # header equals a target name used for another position, the
                # later rename also hits the already-renamed column.
                df_sheet = (
                    df_sheet
                    .select(*[F.col(source_cols[i]) for i in cols_ids])
                    .toDF(*col_names)
                    .na.drop(how="all")
                )

                # Bronze keeps everything as text.
                df_sheet = df_sheet.select(*[F.col(c).cast("string").alias(c) for c in col_names])

                dfs_sheets.append(df_sheet)
            except Exception as e:
                # A broken sheet must not discard the remaining sheets.
                logger.error(f"    üõë Error leyendo hoja '{sheet}' con Spark: {e}")
                continue

        if not dfs_sheets:
            return None

        full_df = dfs_sheets[0]
        for d in dfs_sheets[1:]:
            full_df = full_df.unionAll(d)

        full_df = full_df.withColumn("NOMBRE_ARCHIVO", F.lit(Path(file_path).name)) \
                         .withColumn("FECHA_CARGA", F.lit(PERIODO))

        return full_df
    except Exception as e:
        logger.error(f"   ‚ùå Error cr√≠tico en archivo {file_path}: {e}")
        return None
    
def ingest_excel_to_df_fastexcel(file_path: str,  cols_ids: list, col_names: list) -> DataFrame:
    """Read every sheet of an Excel workbook with fastexcel into one Spark DataFrame.

    Each sheet is loaded with only the columns at positions ``cols_ids``,
    requested as strings, converted to pandas, renamed positionally to
    ``col_names`` and stripped of surrounding whitespace. All usable sheets are
    concatenated and converted to a Spark DataFrame with an all-string schema.

    Args:
        file_path: URI of the workbook; the ``dbfs:`` / ``file:`` prefix is
            removed before handing the path to fastexcel.
        cols_ids: Column positions passed to fastexcel as ``use_columns``.
        col_names: Output names, matched positionally with ``cols_ids``.

    Returns:
        The Spark DataFrame plus ``NOMBRE_ARCHIVO`` (``./<parent folder>/<file>``)
        and ``FECHA_CARGA`` columns, or ``None`` when no sheet could be used or
        a file-level error occurred (errors are logged, never raised).
    """
    excel_path = f"./{Path(file_path).parent.name}/{Path(file_path).name}"
    try:
        path_local = file_path.replace("dbfs:", "").replace("file:", "")
        inspector = fastexcel.read_excel(path_local)
        dfs_sheets = []
        dtype_map = {idx: "string" for idx in cols_ids}

        for sheet_name in inspector.sheet_names:
            try:
                # Keep the sheet *name* in its own variable: it used to be
                # overwritten by the loaded sheet object, so the messages below
                # interpolated the object instead of the sheet name.
                sheet = inspector.load_sheet_by_name(
                        sheet_name,
                        use_columns=cols_ids,
                        dtypes=dtype_map
                    )
                pdf = sheet.to_pandas()

                if len(pdf.columns) != len(col_names):
                    # Routed through the logger (was print) so it reaches the log.
                    logger.warning(f"    ‚ö†Ô∏è Hoja '{sheet_name}' ignorada: Cantidad de columnas no coincide.")
                    continue

                pdf.columns = col_names

                for col in pdf.columns:
                    if pdf[col].dtype == "object":
                        pdf[col] = pdf[col].str.strip()

                dfs_sheets.append(pdf)
            except Exception as e:
                # A broken sheet must not discard the remaining sheets.
                logger.error(f"    üõë Error leyendo hoja '{sheet_name}' con fastexcel: {e}")
                continue

        if not dfs_sheets:
            return None

        full_pdf = pd.concat(dfs_sheets, ignore_index=True)
        schema = StructType([StructField(c, StringType(), True) for c in col_names])
        df_spk = spark.createDataFrame(full_pdf, schema=schema)

        full_df = (
            df_spk
            .withColumn("NOMBRE_ARCHIVO",F.lit(excel_path))
            .withColumn("FECHA_CARGA", F.lit(PERIODO))
        )

        return full_df
    except Exception as e:
        logger.error(f"   ‚ùå Error cr√≠tico en archivo {excel_path}: {e}")
        return None

In [0]:
def save_bronze_table_v2(process_name: str,  table_name: str, cols_ids: list, cols_nam: list, path_process_final: str, archivos_excel: list) -> bool:
    """Load all pending workbooks, append them in one write and archive them.

    Every workbook is read with fastexcel (columns ``cols_ids`` as strings,
    renamed positionally to ``cols_nam``, values stripped), converted to a
    Spark DataFrame and tagged with ``FECHA_CARGA``. The per-file frames are
    unioned and appended to ``table_name`` through ``save_to_table_delta``;
    only after a successful write are the source files moved to
    ``path_process_final``.

    Args:
        process_name: Label used in log messages.
        table_name: Target table.
        cols_ids: Column positions to read from each sheet.
        cols_nam: Output column names, positional with ``cols_ids``.
        path_process_final: Folder that receives the processed files.
        archivos_excel: URIs of the workbooks to load.

    Returns:
        ``True`` when at least one file was loaded, written and archived.
        ``False`` when nothing could be loaded, when the write failed or when
        an unexpected error occurred. Previously an empty result was reported
        as success, unlike ``save_bronze_table``.
    """
    status = False
    dfs_spark = []
    archivos_ok = []

    try:
        # Same for every file, so build it once.
        dtype_map = {idx: "string" for idx in cols_ids}

        for file_uri in archivos_excel:
            path_local = file_uri.replace("dbfs:", "").replace("file:", "")

            try:
                excel_reader = fastexcel.read_excel(path_local)

                pdf_list = []

                for sheet_name in excel_reader.sheet_names:
                    sheet = excel_reader.load_sheet_by_name(
                        sheet_name,
                        use_columns=cols_ids,
                        dtypes=dtype_map
                    )
                    pdf = sheet.to_pandas()

                    if len(pdf.columns) != len(cols_nam):
                        # Routed through the logger (was print) so it reaches the log.
                        logger.warning(f"    ‚ö†Ô∏è Hoja '{sheet_name}' ignorada: Cantidad de columnas no coincide.")
                        continue

                    pdf.columns = cols_nam

                    for col in pdf.columns:
                        if pdf[col].dtype == "object":
                            pdf[col] = pdf[col].str.strip()

                    pdf_list.append(pdf)

                if not pdf_list:
                    continue

                full_pdf = pd.concat(pdf_list, ignore_index=True)
                schema = StructType([StructField(c, StringType(), True) for c in cols_nam])
                df_spk = spark.createDataFrame(full_pdf, schema=schema)

                df_spk = df_spk.withColumn("FECHA_CARGA", F.lit(PERIODO))

                dfs_spark.append(df_spk)
                archivos_ok.append(file_uri)
            except Exception as e:
                # One unreadable file must not stop the remaining files.
                logger.warning(f"   ‚ùå Error en archivo: {path_local}. {e}")
                continue

        if not dfs_spark:
            # Nothing was ingested: report it instead of logging a success.
            logger.warning(f"   ‚ö†Ô∏è No se pudo cargar ning√∫n archivo en la tabla {table_name}.")
            return False

        logger.info("   üîÑ Unificando datos...")
        final_df = dfs_spark[0]
        for d in dfs_spark[1:]:
            final_df = final_df.unionAll(d)

        logger.info(f"   üíæ Guardando en Tabla Delta ({table_name})...")
        status = save_to_table_delta(final_df, table_name, "append", "false")

        if not status:
            return False

        # Archive only after the write succeeded, so a failed write leaves the
        # files in the inbox for the next run.
        logger.info("   üì¶ Moviendo archivos Procesados...")
        for src in archivos_ok:
            file_name = src.split('/')[-1]
            dst = f"{path_process_final}/{file_name}"
            dbutils.fs.mv(src, dst)

        logger.info(f"   ‚úÖ Proceso {process_name} terminado.")
        status = True
    except Exception as e:
        logger.error(f"   ‚ùå Error cr√≠tico ingestando Bronze ({process_name}). {e}")
        status = False

    return status

def save_bronze_table(process_name: str,  table_name: str, cols_ids: list, cols_nam: list, path_process_final: str, archivos_excel: list) -> bool:
    """Load the pending workbooks one by one, appending and archiving each.

    For every URI in ``archivos_excel`` the workbook is read with
    ``ingest_excel_to_df_fastexcel``; a readable file is appended to
    ``table_name`` through ``save_to_table_delta`` and then moved to
    ``path_process_final``. Files that cannot be read are skipped.

    Args:
        process_name: Label used in log messages.
        table_name: Target table.
        cols_ids: Column positions to read from each sheet.
        cols_nam: Output column names, positional with ``cols_ids``.
        path_process_final: Folder that receives the processed files.
        archivos_excel: URIs of the workbooks to load.

    Returns:
        ``True`` when at least one file was appended and moved; ``False`` when
        no file was loaded, as soon as one append fails (remaining files are
        not attempted), or when an unexpected error occurs.
    """
    files_processed = 0
    total_rows = 0

    try:
        for file_uri in archivos_excel:
            file_name = file_uri.split('/')[-1]

            df = ingest_excel_to_df_fastexcel(file_uri, cols_ids, cols_nam)

            # Unreadable workbook: release memory and go on with the next one.
            if df is None:
                gc.collect()
                continue

            saved = save_to_table_delta(df, table_name, "append", "false")
            if not saved:
                # A failed append aborts the whole run.
                del df
                gc.collect()
                return False

            logger.info(f"   üì¶ Moviendo archivo Procesado {file_name}...")
            dbutils.fs.mv(file_uri, f"{path_process_final}/{file_name}")

            files_processed += 1
            total_rows += df.count()

            # Free the frame before the next workbook is loaded.
            del df
            gc.collect()

        if files_processed == 0:
            logger.warning(f"   ‚ö†Ô∏è No se pudo cargar ning√∫n archivo en la tabla {table_name}.")
            return False

        logger.info(f"   üìä Total Registros Guardados: {total_rows:,.0f}")
        logger.info(f"   ‚úÖ Proceso {process_name} terminado. Archivos cargados: {files_processed}/{len(archivos_excel)}")
        return True
    except Exception as e:
        logger.error(f"   ‚ùå Error cr√≠tico ingestando Bronze ({process_name}). {e}")
        return False

In [0]:
def start_process(process: str, folder_path : str, table_name: str, cols_ids: list, cols_nam: list, path_process_final: str) -> bool:
    """Run one Bronze ingestion (list, load, optimize) and report its outcome.

    Args:
        process: Label of the process, used in log messages.
        folder_path: Inbox folder scanned for pending Excel files.
        table_name: Target table; also the table that gets optimized.
        cols_ids: Column positions to read from each sheet.
        cols_nam: Output column names, positional with ``cols_ids``.
        path_process_final: Folder that receives the processed files.

    Returns:
        When pending files were found, the result of ``save_bronze_table``.
        When there was nothing to ingest, whether ``validate_table_delta``
        accepts the table. Previously a passing table check forced the result
        to ``True`` and hid a failed ingestion.
    """
    excel_files = get_list_files(process, folder_path)

    # None means "no ingestion was attempted in this run".
    ingest_ok = None
    if excel_files:
        ingest_ok = bool(save_bronze_table(process, table_name, cols_ids, cols_nam, path_process_final, excel_files))

    # The table is still optimized after a failed ingestion, as before.
    table_ok = bool(validate_table_delta(table_name))
    if table_ok:
        logger.info(f"   üßπ Optimizando tabla Bronze {process}...")
        spark.sql(f"OPTIMIZE {table_name}")

    if ingest_ok is None:
        return table_ok
    return ingest_ok

In [0]:
# Switches to run each ingestion independently.
RUN_EXPUESTOS = True
RUN_CONTRATANTES = True
# Overall outcome; stays False when no process is enabled.
STATUS = False

try:
    # Collect one result per enabled process. STATUS used to be overwritten by
    # the last call, so a failed "Expuestos" run was reported as a success
    # whenever "Contratantes" succeeded.
    RESULTADOS = []

    if RUN_EXPUESTOS:
        RESULTADOS.append(start_process("Expuestos", PATH_INBOX_EXP, TABLE_BRONZE_EXP, COLS_IDX_EXP, COLS_NAM_EXP, PATH_PROCESSED_EXP))

    if RUN_CONTRATANTES:
        RESULTADOS.append(start_process("Contratantes", PATH_INBOX_CONT, TABLE_BRONZE_CONT, COLS_IDX_CONT, COLS_NAM_CONT, PATH_PROCESSED_CONT))

    # Success only when at least one process ran and all of them succeeded.
    STATUS = bool(RESULTADOS) and all(RESULTADOS)

    if STATUS:
        logger.success("üèÅ Ejecuci√≥n Completa: Proceso Bronze Finalizado con √©xito.")
    else:
        logger.error("üèÅ Ejecuci√≥n Incompleta: Proceso Bronze Finalizado con Error.")
except Exception as e:
    logger.error(f"‚ùå Error cr√≠tico en proceso Bronze. {e}")
finally:
    # Always report the elapsed time and close the log, even after a failure.
    # HORA_INICIAL is not defined in this notebook; presumably it is set by
    # src.utilities -- TODO confirm.
    HORA_FINAL = datetime.now()
    difference_time = HORA_FINAL-HORA_INICIAL
    total_seconds = int(difference_time.total_seconds())
    difference_formated = "{} minuto(s), {} segundo(s)".format((total_seconds // 60), total_seconds % 60)
    logger.info(f"Tiempo de proceso: {difference_formated}")
    close_log()