### Configuración inicial

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from delta import configure_spark_with_delta_pip
from sodapy import Socrata
import os
import json

# Address of the standalone Spark master this notebook connects to.
master_url = "spark://spark-master:7077"

# Session builder: registers the Delta SQL extension and Delta catalog,
# and sets the number of shuffle partitions to 4.
builder = (
    SparkSession.builder
    .appName("Ingesta_Bronze_SECOP_API")
    .master(master_url)
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.sql.shuffle.partitions", "4")
)

# Pass the builder through the Delta helper, then create (or reuse) the session.
delta_builder = configure_spark_with_delta_pip(builder)
spark = delta_builder.getOrCreate()

print("SparkSession iniciada con Ã©xito")

:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-2ad50802-034e-4ec4-a847-2f29e421cf1f;1.0
	confs: [default]
	found io.delta#delta-spark_2.12;3.0.0 in central
	found io.delta#delta-storage;3.0.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 165ms :: artifacts dl 6ms
	:: modules in use:
	io.delta#delta-spark_2.12;3.0.0 from central in [default]
	io.delta#delta-storage;3.0.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |   0   |   0   |   

SparkSession iniciada con Ã©xito


### Descarga de datos

In [2]:
# Socrata client for the datos.gov.co portal; None means no app token is sent.
client = Socrata("www.datos.gov.co", None)
dataset_id = "jbjy-vk9h"

all_results = []
limit = 20000  # rows requested per API call
offset = 0

print("Descargando contratos del aÃ±o 2025...")

while True:
    # Fetch one page of contracts signed from 2025-07-01 through 2025-12-31.
    # ORDER BY :id pins a stable row order across requests; without an explicit
    # order, LIMIT/OFFSET paging may return overlapping pages or skip rows.
    results = client.get(dataset_id, query=f"""
        SELECT *
        WHERE
            fecha_de_firma >= '2025-07-01T00:00:00'
        AND
            fecha_de_firma <= '2025-12-31T23:59:59'
        ORDER BY :id
        LIMIT {limit}
        OFFSET {offset}
    """)

    all_results.extend(results)

    if results:
        print(f" Acumulados: {len(all_results)} registros...")

    # A page shorter than `limit` means the last page has been reached.
    if len(results) < limit:
        break

    offset += limit



Descargando contratos del aÃ±o 2025...


26/01/30 21:18:25 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


 Acumulados: 20000 registros...
 Acumulados: 40000 registros...
 Acumulados: 60000 registros...
 Acumulados: 80000 registros...
 Acumulados: 100000 registros...
 Acumulados: 120000 registros...
 Acumulados: 140000 registros...
 Acumulados: 160000 registros...
 Acumulados: 180000 registros...
 Acumulados: 200000 registros...
 Acumulados: 220000 registros...
 Acumulados: 240000 registros...
 Acumulados: 260000 registros...
 Acumulados: 280000 registros...
 Acumulados: 300000 registros...
 Acumulados: 320000 registros...
 Acumulados: 340000 registros...
 Acumulados: 360000 registros...
 Acumulados: 380000 registros...
 Acumulados: 400000 registros...
 Acumulados: 420000 registros...
 Acumulados: 440000 registros...
 Acumulados: 456617 registros...


### Limpiar nombres de columnas

In [5]:
def _clean_column_name(col_name):
    """Return a normalized version of a column name.

    The name is lowercased, spaces become underscores, and dots, parentheses,
    commas and semicolons are removed. The accented-vowel substitutions listed
    below are applied as well.
    """
    return (
        col_name.lower().replace(" ", "_").replace(".", "")
        .replace("Ã¡", "a").replace("Ã©", "e").replace("Ã­", "i").replace("Ã³", "o").replace("Ãº", "u")
        .replace("(", "").replace(")", "")
        .replace(",", "").replace(";", "")
    )


# Stage the records downloaded from the API in a temporary JSON Lines file.
temp_json = "/app/data/raw/temp_secop_2025.json"
os.makedirs(os.path.dirname(temp_json), exist_ok=True)

with open(temp_json, 'w', encoding='utf-8') as f:
    # One JSON object per line; ensure_ascii=False keeps non-ASCII text as-is.
    for record in all_results:
        f.write(json.dumps(record, ensure_ascii=False) + '\n')

# Read the staged file with Spark.
df_raw = spark.read.json(temp_json)

# Cast EVERY column to string.
df_all_string = df_raw.select([F.col(c).cast("string") for c in df_raw.columns])

# Rename all columns in one positional pass. Unlike a loop of
# withColumnRenamed calls, a later rename cannot accidentally match a name
# produced by an earlier one, and the plan gains a single projection.
df_bronze = df_all_string.toDF(*[_clean_column_name(c) for c in df_all_string.columns])

# Audit columns: ingestion timestamp and a fixed source label.
df_bronze = df_bronze.withColumn("_ingestion_time", F.current_timestamp()) \
                     .withColumn("_source_file", F.lit("API_Socrata_Bogota_2025"))

                                                                                

### Guardar datos en Delta

In [9]:
# Destination path of the Bronze Delta table.
output_path = "/app/data/lakehouse/bronze/secop"

# Count the rows first so the progress message can report the total.
record_count = df_bronze.count()
print(f"ðŸ’¾ Guardando {record_count} registros en Bronce...")

# Repartition into 10 partitions and overwrite the data at output_path.
bronze_writer = df_bronze.repartition(10).write.format("delta").mode("overwrite")
bronze_writer.save(output_path)

print("Â¡Capa Bronce finalizada con Ã©xito!")

                                                                                

ðŸ’¾ Guardando 456617 registros en Bronce...




Â¡Capa Bronce finalizada con Ã©xito!


                                                                                

### Leer Delta

In [10]:
bronze_path = "/app/data/lakehouse/bronze/secop"

# Load the table back through the "delta" reader.
delta_reader = spark.read.format("delta")
df_bronze_check = delta_reader.load(bronze_path)

print(f" Lectura exitosa. Registros en el Lakehouse: {df_bronze_check.count()}")

# Convert the first 5 rows to pandas; as the last expression it is the cell output.
sample_rows = df_bronze_check.limit(5)
sample_rows.toPandas()

                                                                                

 Lectura exitosa. Registros en el Lakehouse: 456617


Unnamed: 0,anno_bpin,c_digo_bpin,ciudad,codigo_de_categoria_principal,codigo_entidad,codigo_proveedor,condiciones_de_entrega,departamento,descripcion_del_proceso,descripcion_documentos_tipo,...,valor_amortizado,valor_de_pago_adelantado,valor_del_contrato,valor_facturado,valor_pagado,valor_pendiente_de,valor_pendiente_de_ejecucion,valor_pendiente_de_pago,_ingestion_time,_source_file
0,2025,202500000002779,ChinÃ¡cota,V1.80111600,704851104,730599727,No Definido,Norte de Santander,PRESTAR LOS SERVICIOS PROFESIONALES COMO FORMA...,No definido,...,0,0,12343333,12343333,11500000,0,843333,843333,2026-01-30 22:18:40.643078,API_Socrata_Bogota_2025
1,No D,No Definido,No Definido,V1.80111607,705008498,718547920,Como acordado previamente,No Definido,Brindar acompaÃ±amiento jurÃ­dico y apoyo profes...,No definido,...,0,0,20000000,20000000,20000000,0,0,0,2026-01-30 22:18:40.643078,API_Socrata_Bogota_2025
2,No D,No Definido,Cartagena,V1.85121600,709192637,712389626,No Definido,BolÃ­var,PRESTACIÃ“N DE LOS SERVICIOS COMO GESTORES DE S...,No definido,...,0,0,4000000,4000000,4000000,0,0,0,2026-01-30 22:18:40.643078,API_Socrata_Bogota_2025
3,No D,No Definido,Los Patios,V1.85101601,713088169,731045159,No Definido,Norte de Santander,PRESTAR SUS SERVICIOS EN CONDICIÃ“N DE TÃ‰CNICO ...,No definido,...,0,0,10000000,10000000,10000000,0,0,0,2026-01-30 22:18:40.643078,API_Socrata_Bogota_2025
4,No D,No Definido,Amalfi,V1.81111819,718317027,718871106,A convenir,Antioquia,PrestaciÃ³n de servicios de AsesorÃ­a en Calidad...,No definido,...,0,0,12637800,8425200,8425200,0,4212600,4212600,2026-01-30 22:18:40.643078,API_Socrata_Bogota_2025


26/01/30 22:29:56 ERROR StandaloneSchedulerBackend: Application has been killed. Reason: Master removed our application: KILLED
26/01/30 22:29:56 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exiting due to error from cluster scheduler: Master removed our application: KILLED
	at org.apache.spark.errors.SparkCoreErrors$.clusterSchedulerError(SparkCoreErrors.scala:291)
	at org.apache.spark.scheduler.TaskSchedulerImpl.error(TaskSchedulerImpl.scala:981)
	at org.apache.spark.scheduler.cluster.StandaloneSchedulerBackend.dead(StandaloneSchedulerBackend.scala:165)
	at org.apache.spark.deploy.client.StandaloneAppClient$ClientEndpoint.markDead(StandaloneAppClient.scala:263)
	at org.apache.spark.deploy.client.StandaloneAppClient$ClientEndpoint$$anonfun$receive$1.applyOrElse(StandaloneAppClient.scala:170)
	at org.apache.spark.rpc.netty.Inbox.$anonfun$process$1(Inbox.scala:115)
	at org.apache.spark.rpc.netty.Inbox.safelyCall(Inbox.scala:213)
	at org.apache.spark.rpc.netty.Inbox.proce

In [8]:
# Print the schema (column names, types, nullability) of the DataFrame read back from Delta.
df_bronze_check.printSchema()

root
 |-- anno_bpin: string (nullable = true)
 |-- c_digo_bpin: string (nullable = true)
 |-- ciudad: string (nullable = true)
 |-- codigo_de_categoria_principal: string (nullable = true)
 |-- codigo_entidad: string (nullable = true)
 |-- codigo_proveedor: string (nullable = true)
 |-- condiciones_de_entrega: string (nullable = true)
 |-- departamento: string (nullable = true)
 |-- descripcion_del_proceso: string (nullable = true)
 |-- descripcion_documentos_tipo: string (nullable = true)
 |-- destino_gasto: string (nullable = true)
 |-- dias_adicionados: string (nullable = true)
 |-- documento_proveedor: string (nullable = true)
 |-- documentos_tipo: string (nullable = true)
 |-- domicilio_representante_legal: string (nullable = true)
 |-- duraci_n_del_contrato: string (nullable = true)
 |-- el_contrato_puede_ser_prorrogado: string (nullable = true)
 |-- entidad_centralizada: string (nullable = true)
 |-- es_grupo: string (nullable = true)
 |-- es_pyme: string (nullable = true)
 |-- e