# Notebook 01: Ingesta de Datos desde API SECOP 


In [12]:
from pyspark.sql import SparkSession 
from pyspark.sql import functions as F 
from delta import configure_spark_with_delta_pip 
from sodapy import Socrata
import os 

# Challenge 1: SparkSession
# Session options are collected in one mapping and applied in order; the two
# Delta entries register the Delta SQL extension and the Delta catalog.
master_url = "spark://spark-master:7077"
spark_settings = {
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    "spark.executor.memory": "2g",
    "spark.driver.memory": "2g",
    "spark.sql.shuffle.partitions": "4",
}

builder = SparkSession.builder.appName("Ingesta_Bronze_SECOP_API").master(master_url)
for setting_name, setting_value in spark_settings.items():
    builder = builder.config(setting_name, setting_value)

# configure_spark_with_delta_pip receives the configured builder and the
# session is obtained (or reused) from what it returns.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [None]:
# Challenge 2 and 3: paged download from the Socrata API and conversion to Spark
# The app token is optional: when SOCRATA_APP_TOKEN is not set this is None,
# i.e. the same unauthenticated client as before.
client = Socrata("www.datos.gov.co", os.environ.get("SOCRATA_APP_TOKEN"))
dataset_id = "jbjy-vk9h"
limit = 50000
offset = 0

print("Descargando y procesando por lotes...")

# Accumulates the union of every page; remains None until the first
# non-empty page has been converted to a DataFrame.
df_final = None

while True:
    # SELECT * fetches every field of the rows signed in the second half of 2025.
    # ORDER BY :id makes the row order stable across requests; without an
    # explicit order, LIMIT/OFFSET paging may repeat or skip rows between pages.
    results = client.get(dataset_id, query=f"""
        SELECT * WHERE fecha_de_firma >= '2025-07-01T00:00:00'
          AND fecha_de_firma <= '2025-12-31T23:59:59'
        ORDER BY :id
        LIMIT {limit} OFFSET {offset}
    """)

    # An empty page means the previous page was the last one.
    if not results:
        break

    batch_df = spark.createDataFrame(results)

    if df_final is None:
        df_final = batch_df
    else:
        # Pages may not expose the same set of fields, so columns are matched
        # by name and missing ones are filled with nulls.
        df_final = df_final.unionByName(batch_df, allowMissingColumns=True)

    print(f"  Procesados: {offset + len(results)} registros...")

    # A short page is the last page; avoid one extra empty request.
    if len(results) < limit:
        break
    offset += limit

# Fail here with a clear message instead of a later AttributeError on None.
if df_final is None:
    raise RuntimeError("La API no devolvió registros para el rango de fechas solicitado")

# --- Procesamiento en Spark  ---
# Limpieza de nombres de columnas
def clean_column_name(name):
    """Return a lower-cased column name that is safe to write to Parquet/Delta.

    The name is lower-cased first; then, in a single pass:
      * spaces, tabs and newlines become "_",
      * the accented vowels á é í ó ú lose their accent,
      * ".", "(", ")", ",", ";", "{", "}" and "=" are removed.

    Together with the whitespace handling this covers the full set of
    characters that Parquet and Delta reject in column names
    (space , ; { } ( ) newline tab =).

    Args:
        name: Original column name (str).

    Returns:
        The cleaned column name (str). An empty string is returned unchanged.
    """
    translation = str.maketrans({
        " ": "_", "\t": "_", "\n": "_",
        ".": "", "(": "", ")": "", ",": "", ";": "",
        "{": "", "}": "", "=": "",
        "á": "a", "é": "e", "í": "i", "ó": "o", "ú": "u",
    })
    # Lower-case before translating so upper-case accented vowels are covered
    # by the lower-case entries of the table.
    return name.lower().translate(translation)

# Bulk rename: every column of df_final gets its cleaned name.
# toDF renames by position, so a source name containing "." is never parsed
# as a struct-field path (which is what F.col("a.b") would do and fail on).
df_bronze = df_final.toDF(*[clean_column_name(c) for c in df_final.columns])

# Ingestion metadata: load timestamp and a fixed label identifying the source.
df_bronze = (
    df_bronze
    .withColumn("_ingestion_time", F.current_timestamp())
    .withColumn("_source_file", F.lit("API_Socrata_Bogota_2025"))
)



Descargando y procesando por lotes...
  Procesados: 50000 registros...
  Procesados: 100000 registros...
  Procesados: 150000 registros...
  Procesados: 200000 registros...
  Procesados: 250000 registros...
  Procesados: 300000 registros...
  Procesados: 350000 registros...
  Procesados: 400000 registros...
  Procesados: 450000 registros...
  Procesados: 456579 registros...


In [14]:
# Challenge 4: column selection
# Columns requested for the downstream ML dataset.
cols_ml = ["referencia_del_contrato", "departamento", "ciudad", "tipo_de_contrato",
           "modalidad_de_contratacion", "estado_contrato", "sector", "orden",
           "valor_del_contrato", "fecha_de_firma", "dias_adicionados"]

# Keep only the requested columns that are actually present, preserving the
# order of cols_ml, and report the missing ones instead of dropping them silently.
available_columns = set(df_bronze.columns)
cols_existentes = [c for c in cols_ml if c in available_columns]
cols_faltantes = [c for c in cols_ml if c not in available_columns]
if cols_faltantes:
    print(f"Advertencia: columnas no encontradas en df_bronze: {cols_faltantes}")

df_ml = df_bronze.select(*cols_existentes)

In [None]:
# Challenge 5: persist the results
# The reduced ML column set is written as Parquet, replacing any previous output.
parquet_path = "/opt/spark-data/raw/secop_contratos.parquet"
df_ml.write.mode("overwrite").parquet(parquet_path)

# The full Bronze table is written as Delta, replacing any previous version.
# overwriteSchema lets the overwrite also replace the table schema; without it
# the write fails when this run's columns differ from the table already stored
# at delta_path.
# NOTE(review): assumes the Bronze schema may change between runs because it is
# inferred from the API rows - confirm schema replacement is acceptable here.
delta_path = "/opt/spark-data/processed/lakehouse/bronze/secop"
(
    df_bronze.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(delta_path)
)

print("Proceso finalizado exitosamente")
spark.stop()

26/02/15 00:17:20 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
26/02/15 00:17:21 WARN TaskSetManager: Stage 0 contains a task of very large size (42056 KiB). The maximum recommended task size is 1000 KiB.
26/02/15 00:17:42 WARN TaskSetManager: Stage 1 contains a task of very large size (42056 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

Proceso finalizado exitosamente
