# Notebook 01: Ingesta de Datos desde API SECOP
**Retos 1‑5**

In [None]:
from pyspark.sql import SparkSession\n
from pyspark.sql import functions as F\n
from delta import configure_spark_with_delta_pip\n
from sodapy import Socrata\n
import os\n
import json\n
\n
# Challenge 1: build a SparkSession attached to the standalone cluster.
# The Delta SQL extension and the Delta catalog are both registered, and the
# shuffle partition count is set to 4.
spark_settings = {
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    "spark.sql.shuffle.partitions": "4",
}

master_url = "spark://spark-master:7077"
builder = SparkSession.builder.appName("Ingesta_Bronze_SECOP_API").master(master_url)
for setting_key, setting_value in spark_settings.items():
    builder = builder.config(setting_key, setting_value)

# The builder is passed through the Delta pip helper before the session is
# created (or an already-running one is reused).
spark = configure_spark_with_delta_pip(builder).getOrCreate()
print("SparkSession iniciada con éxito")

### Reto 2: Descargar datos desde la API de Datos Abiertos Colombia

In [None]:
# Challenge 2: download the contracts signed in the second half of 2025 from
# the Colombian open-data portal, one page of `limit` rows at a time.
# The client is created without an app token (anonymous access).
client = Socrata("www.datos.gov.co", None)
dataset_id = "jbjy-vk9h"

all_results = []
limit = 20000
offset = 0
print("Descargando contratos del 2° semestre 2025...")

while True:
    # ORDER BY :id gives every page the same stable ordering. Without an
    # explicit order, LIMIT/OFFSET paging is not guaranteed to be consistent
    # across requests, so rows could be repeated or skipped between pages.
    results = client.get(dataset_id, query=f"""
        SELECT *
        WHERE fecha_de_firma >= '2025-07-01T00:00:00'
          AND fecha_de_firma <= '2025-12-31T23:59:59'
        ORDER BY :id
        LIMIT {limit} OFFSET {offset}
    """)
    all_results.extend(results)
    # A page shorter than `limit` is the last one.
    if len(results) < limit:
        break
    offset += limit
    print(f"  Acumulados: {len(all_results)} registros...")

print(f"Total descargado: {len(all_results)} contratos")

### Reto 3: Cargar datos en Spark y explorar el esquema

In [None]:
# Challenge 3: load the downloaded records into Spark as an all-string Bronze
# DataFrame with normalised column names and audit columns, then inspect it.

# One-pass translation table: spaces become underscores, lower-case accented
# vowels lose their accent, and the characters . ( ) , ; are removed.
_COLUMN_NAME_TABLE = str.maketrans("áéíóú ", "aeiou_", ".(),;")


def _clean_column_name(raw_name):
    """Return raw_name lower-cased, with spaces, accents and punctuation normalised."""
    return raw_name.lower().translate(_COLUMN_NAME_TABLE)


# Stage the records as JSON Lines (one object per line) so Spark can read them.
temp_json = "/app/data/raw/temp_secop_2025.json"
os.makedirs(os.path.dirname(temp_json), exist_ok=True)
with open(temp_json, 'w', encoding='utf-8') as f:
    f.writelines(
        json.dumps(record, ensure_ascii=False) + '\n' for record in all_results
    )

df_raw = spark.read.json(temp_json)

# Cast every column to string. Each name is back-tick quoted (embedded
# back-ticks doubled) so that a name containing "." is taken literally rather
# than parsed as a nested-field path; the explicit alias keeps the original
# name on the cast column.
df_all_string = df_raw.select([
    F.col("`" + c.replace("`", "``") + "`").cast("string").alias(c)
    for c in df_raw.columns
])

# Column-name cleanup: rename all columns positionally in a single projection
# instead of stacking one withColumnRenamed per column.
df_bronze = df_all_string.toDF(
    *[_clean_column_name(c) for c in df_all_string.columns]
)

# Audit columns: ingestion timestamp and a label for the data source.
df_bronze = (
    df_bronze
    .withColumn("_ingestion_time", F.current_timestamp())
    .withColumn("_source_file", F.lit("API_Socrata_Bogota_2025"))
)

print("Esquema:")
df_bronze.printSchema()
df_bronze.show(5, truncate=False)

### Reto 4: Seleccionar columnas clave para ML

In [None]:
# Challenge 4: candidate columns for modelling and EDA
# (all of them available at signing time).
cols_ml = [
    "referencia_del_contrato",
    "departamento",
    "ciudad",
    "tipo_de_contrato",
    "modalidad_de_contratacion",
    "estado_contrato",
    "sector",
    "orden",
    "valor_del_contrato",  # target
    "fecha_de_firma",
    "dias_adicionados",  # numeric feature
]

# Keep only the candidates actually present in the Bronze frame, preserving
# the order of cols_ml; candidates that are absent are skipped.
bronze_column_names = set(df_bronze.columns)
cols_existentes = [name for name in cols_ml if name in bronze_column_names]

df_ml = df_bronze.select(cols_existentes)
print(f"Columnas seleccionadas: {cols_existentes}")
df_ml.show(5)

### Reto 5: Guardar en formato Parquet optimizado

In [None]:
# Challenge 5: persist the ML subset as Parquet and the full Bronze frame as
# a Delta table, then shut the session down.

# --- Parquet for the ML phases ---
# coalesce(1) reduces the frame to a single partition before the write.
parquet_path = "/opt/spark-data/raw/secop_contratos.parquet"
df_ml.coalesce(1).write.mode("overwrite").parquet(parquet_path)
print(f"Parquet guardado en: {parquet_path}")

# --- Delta for the Bronze layer (lakehouse) ---
# The Bronze schema is inferred from the downloaded records, so the set of
# columns can differ between runs. overwriteSchema lets this full overwrite
# replace the table schema instead of failing Delta's schema enforcement
# when a re-run brings new columns.
delta_path = "/app/data/lakehouse/bronze/secop"
(
    df_bronze.repartition(10)
    .write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(delta_path)
)
print(f"Delta guardado en: {delta_path}")

# Both outputs are written; stop the Spark session.
spark.stop()