# Notebook 01: Ingesta de Datos desde API SECOP\n**Retos 1‑5**

In [None]:
import json
import os

from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from sodapy import Socrata

# Challenge 1: SparkSession attached to the Spark cluster master.
master_url = "spark://spark-master:7077"

# Session settings applied to the builder: the first two register the Delta
# SQL extension and the Delta catalog, the last sets shuffle partitions to 4.
spark_settings = {
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    "spark.sql.shuffle.partitions": "4",
}

builder = SparkSession.builder.appName("Ingesta_Bronze_SECOP_API").master(master_url)
for setting_key, setting_value in spark_settings.items():
    builder = builder.config(setting_key, setting_value)

# configure_spark_with_delta_pip receives the builder before the session is created.
spark = configure_spark_with_delta_pip(builder).getOrCreate()
print("SparkSession iniciada con éxito")

### Reto 2: Descargar datos desde la API de Datos Abiertos Colombia

In [None]:
# Challenge 2: download every contract signed in the second half of 2025,
# page by page, into the in-memory list `all_results`.
client = Socrata("www.datos.gov.co", None)  # second argument None: no app token is supplied
dataset_id = "jbjy-vk9h"

all_results = []
limit = 20000   # page size requested per call
offset = 0      # number of rows to skip for the current page
print("Descargando contratos del 2° semestre 2025...")

try:
    while True:
        # ORDER BY :id is required for correct paging: without an explicit
        # order the API does not guarantee a stable row order between
        # requests, so LIMIT/OFFSET pages could repeat or skip rows.
        # The upper bound is half-open (< 2026-01-01) so timestamps in the
        # last second of the year (e.g. 23:59:59.500) are not excluded.
        results = client.get(dataset_id, query=f"""
            SELECT *
            WHERE fecha_de_firma >= '2025-07-01T00:00:00'
              AND fecha_de_firma < '2026-01-01T00:00:00'
            ORDER BY :id
            LIMIT {limit} OFFSET {offset}
        """)
        all_results.extend(results)
        # A short page means there is nothing left to fetch.
        if len(results) < limit:
            break
        offset += limit
        print(f"  Acumulados: {len(all_results)} registros...")
finally:
    # Release the underlying HTTP session whether or not the download succeeded.
    client.close()

print(f"Total descargado: {len(all_results)} contratos")

### Reto 3: Cargar datos en Spark y explorar el esquema

In [None]:
# Challenge 3: stage the downloaded records as a JSON Lines file (one JSON
# object per line) so Spark can read them.
temp_json = "/app/data/raw/temp_secop_2025.json"
os.makedirs(os.path.dirname(temp_json), exist_ok=True)
with open(temp_json, 'w', encoding='utf-8') as json_lines_file:
    json_lines_file.writelines(
        json.dumps(contract, ensure_ascii=False) + "\n" for contract in all_results
    )

df_raw = spark.read.json(temp_json)
# Cast every column to string.
df_all_string = df_raw.select([F.col(name).cast("string") for name in df_raw.columns])

# Ordered (old, new) substitutions applied to each lower-cased column name:
# spaces become underscores, accented vowels lose the accent, and
# the characters . ( ) , ; are removed.
_COLUMN_NAME_SUBSTITUTIONS = (
    (" ", "_"), (".", ""),
    ("á", "a"), ("é", "e"), ("í", "i"), ("ó", "o"), ("ú", "u"),
    ("(", ""), (")", ""), (",", ""), (";", ""),
)


def _clean_column_name(raw_name):
    """Return `raw_name` lower-cased with the substitutions above applied in order."""
    cleaned = raw_name.lower()
    for old, new in _COLUMN_NAME_SUBSTITUTIONS:
        cleaned = cleaned.replace(old, new)
    return cleaned


# Column-name cleanup: rename one column at a time, iterating over the
# column list as it was before any rename.
df_bronze = df_all_string
for original_name in df_all_string.columns:
    df_bronze = df_bronze.withColumnRenamed(original_name, _clean_column_name(original_name))

# Audit columns: ingestion timestamp and a constant source label.
df_bronze = (
    df_bronze
    .withColumn("_ingestion_time", F.current_timestamp())
    .withColumn("_source_file", F.lit("API_Socrata_Bogota_2025"))
)

print("Esquema:")
df_bronze.printSchema()
df_bronze.show(5, truncate=False)

### Reto 4: Seleccionar columnas clave para ML

In [None]:
# Challenge 4: columns relevant for modeling and EDA (intended to be those
# available at signing time).
# NOTE(review): by name, estado_contrato and dias_adicionados look like
# post-signing fields; confirm they do not leak future information.
cols_ml = [
    "referencia_del_contrato",
    "departamento",
    "ciudad",
    "tipo_de_contrato",
    "modalidad_de_contratacion",
    "estado_contrato",
    "sector",
    "orden",
    "valor_del_contrato",          # target
    "fecha_de_firma",
    "dias_adicionados"            # numeric feature
]

# Look the DataFrame columns up once instead of once per requested column.
available_cols = set(df_bronze.columns)
cols_existentes = [c for c in cols_ml if c in available_cols]

# Requested columns that are absent are still skipped (best effort), but they
# are now reported so a missing feature or target is noticed here rather than
# in a later notebook.
cols_faltantes = [c for c in cols_ml if c not in available_cols]
if cols_faltantes:
    print(f"ADVERTENCIA: columnas no encontradas en df_bronze (omitidas): {cols_faltantes}")

df_ml = df_bronze.select(*cols_existentes)
print(f"Columnas seleccionadas: {cols_existentes}")
df_ml.show(5)

### Reto 5: Guardar en formato Parquet optimizado

In [None]:
# Challenge 5: persist both outputs, then shut the session down.

# --- Parquet for the ML phases ---
# The ML subset is coalesced to one partition before being written.
parquet_path = "/opt/spark-data/raw/secop_contratos.parquet"
ml_writer = df_ml.coalesce(1).write.mode("overwrite")
ml_writer.parquet(parquet_path)
print(f"Parquet guardado en: {parquet_path}")

# --- Delta for the Bronze layer (lakehouse) ---
# The full bronze frame is repartitioned into 10 partitions and overwrites the target path.
delta_path = "/app/data/lakehouse/bronze/secop"
bronze_writer = df_bronze.repartition(10).write.format("delta").mode("overwrite")
bronze_writer.save(delta_path)
print(f"Delta guardado en: {delta_path}")

# Stop the SparkSession once both writes have finished.
spark.stop()