# Notebook 02: Análisis Exploratorio y Limpieza\n**Fusión de EDA + transformación Silver (solo columnas ML)**

In [None]:
# Plotting / pandas helpers used for the local (driver-side) part of the EDA.
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# PySpark entry point, column functions and types used across the notebook.
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, count, sum as spark_sum, avg, min as spark_min, max as spark_max,
    stddev, isnan, when, isnull, desc, to_date, year, month, percentile_approx,
    datediff, upper, lit
)
from pyspark.sql.types import DecimalType, TimestampType, DateType

# Create (or reuse) the session against the master at spark-master:7077,
# asking for 2g of memory per executor.
spark = (
    SparkSession.builder
    .appName("SECOP_EDA_Clean")
    .master("spark://spark-master:7077")
    .config("spark.executor.memory", "2g")
    .getOrCreate()
)

print(f"Spark Version: {spark.version}")

In [None]:
# Load the Parquet dataset produced by the ingestion step.
raw_path = "/opt/spark-data/raw/secop_contratos.parquet"
df = spark.read.parquet(raw_path)

# Report the size of the raw dataset and its schema.
raw_rows = df.count()
raw_columns = len(df.columns)
print(f"Registros crudos: {raw_rows:,}")
print(f"Columnas: {raw_columns}")
df.printSchema()

## 1. Estadísticas descriptivas (Reto 1)

In [None]:
# Cast the target column to double so numeric statistics can be computed on it.
target_as_double = col("valor_del_contrato").cast("double")
df = df.withColumn("valor_del_contrato_num", target_as_double)

# Summary statistics for the numeric target and the added-days column.
df.describe("valor_del_contrato_num", "dias_adicionados").show()

## 2. Análisis de valores nulos y estrategia (Reto 2)

In [None]:
# Count missing values per column.
#
# `isnan` is only defined for float/double columns. Applying it to every
# column fails the analysis on date/timestamp/boolean columns and relies on an
# implicit string->double cast for string columns, so NaN is checked only
# where the column type can actually hold it; all other columns are checked
# for NULL only.
nan_capable = {name for name, dtype in df.dtypes if dtype in ("float", "double")}

# `when(cond, c)` yields the (non-null) literal column name for missing rows,
# so `count` tallies exactly the rows where the condition holds.
null_counts = df.select([
    count(
        when((isnull(c) | isnan(c)) if c in nan_capable else isnull(c), c)
    ).alias(c)
    for c in df.columns
])

# Count rows once and reuse it; fall back to 1 so an empty dataset gives 0%
# instead of a division by zero.
total_rows = df.count()

null_df = null_counts.toPandas().T
null_df.columns = ['null_count']
null_df['null_percentage'] = (null_df['null_count'] / max(total_rows, 1)) * 100
null_df = null_df.sort_values('null_count', ascending=False)
print(null_df[null_df['null_count'] > 0])

# Strategy: drop rows whose target is null; categorical columns are handled
# later (imputed or mapped to "Desconocido").
df = df.filter(col("valor_del_contrato_num").isNotNull())

## 3. Explorar variable objetivo (Reto 3)

In [None]:
contract_value = col("valor_del_contrato_num")

# Range, mean, spread and approximate median of the target.
summary_exprs = [
    spark_min(contract_value).alias("Min"),
    spark_max(contract_value).alias("Max"),
    avg(contract_value).alias("Promedio"),
    stddev(contract_value).alias("Desv_Std"),
    percentile_approx(contract_value, 0.5).alias("Mediana"),
]
df.select(*summary_exprs).show()

# Distribution by value range: each bucket counts the rows matching its
# condition (rows that do not match produce NULL and are ignored by `count`).
value_buckets = [
    ("< 10M", contract_value < 1e7),
    ("10M-100M", (contract_value >= 1e7) & (contract_value < 1e8)),
    ("100M-1B", (contract_value >= 1e8) & (contract_value < 1e9)),
    ("> 1B", contract_value >= 1e9),
]
df.select(
    *[count(when(condition, True)).alias(label) for label, condition in value_buckets]
).show()

## 4. Distribución por departamento (Reto 4)

In [None]:
import os

# Number of contracts and total contract value per department, most active first.
df_dept = (
    df.groupBy("departamento")
    .agg(
        count("*").alias("num_contratos"),
        spark_sum("valor_del_contrato_num").alias("valor_total"),
    )
    .orderBy(desc("num_contratos"))
)
df_dept.show(10, truncate=False)

# Bring only the top 10 rows to the driver for plotting.
pdf_dept = df_dept.limit(10).toPandas()

# matplotlib's categorical axis only accepts str labels; a null department
# group (possible because nulls are not filtered out at this point) would make
# `barh` raise, so give it an explicit label for the chart only.
dept_labels = pdf_dept['departamento'].fillna('(sin departamento)').astype(str)

# `savefig` does not create missing directories, so make sure the target exists.
plot_path = '/opt/spark-data/processed/eda_departamentos.png'
os.makedirs(os.path.dirname(plot_path), exist_ok=True)

plt.figure(figsize=(12,5))

# Left panel: number of contracts.
plt.subplot(1,2,1)
plt.barh(dept_labels, pdf_dept['num_contratos'])
plt.xlabel('Número de Contratos')
plt.title('Top 10 Departamentos por Contratos')

# Right panel: total value, scaled by 1e9 to match the axis label.
plt.subplot(1,2,2)
plt.barh(dept_labels, pdf_dept['valor_total']/1e9)
plt.xlabel('Valor Total (Miles de Millones COP)')
plt.title('Top 10 Departamentos por Valor Total')

plt.tight_layout()
plt.savefig(plot_path, dpi=150, bbox_inches='tight')
plt.show()

## 5. Tipo de contrato y estado (Reto 5)

In [None]:
# Show the 10 most frequent values of each categorical column, most common first.
for category_column in ("tipo_de_contrato", "estado_contrato"):
    frequency = df.groupBy(category_column).count()
    frequency.orderBy(desc("count")).show(10, truncate=False)

## 6. Detección de outliers con IQR (Reto 6)

In [None]:
# Approximate quartiles of the target (relative error 0.01).
percentiles = df.approxQuantile("valor_del_contrato_num", [0.25, 0.5, 0.75], 0.01)

# An empty DataFrame yields no quantiles; fail with a clear message instead of
# an IndexError below (or a ZeroDivisionError in the percentage).
if len(percentiles) < 3:
    raise ValueError(
        "Cannot compute IQR bounds: no rows with a non-null valor_del_contrato_num"
    )

q1, q3 = percentiles[0], percentiles[2]
iqr = q3 - q1

# Standard 1.5 * IQR fences around the interquartile range.
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

num_outliers = df.filter(
    (col("valor_del_contrato_num") < lower_bound) |
    (col("valor_del_contrato_num") > upper_bound)
).count()

# Count the rows once instead of launching another job inside the f-string.
total_rows = df.count()

print(f"Q1: {q1:,.2f}, Q3: {q3:,.2f}, IQR: {iqr:,.2f}")
print(f"Rango normal: ${lower_bound:,.2f} - ${upper_bound:,.2f}")
print(f"Outliers: {num_outliers:,} ({num_outliers/total_rows*100:.2f}%)")

# Strategy: keep only rows inside the fences so outliers do not skew the model.
df_clean = df.filter(
    (col("valor_del_contrato_num") >= lower_bound) &
    (col("valor_del_contrato_num") <= upper_bound)
)

## Bonus: Análisis temporal

In [None]:
# Derive the signing date plus its year and month from `fecha_de_firma`.
signing_date = to_date(col("fecha_de_firma"))
df_temporal = (
    df_clean
    .withColumn("fecha_firma_dt", signing_date)
    .withColumn("anio", year("fecha_firma_dt"))
    .withColumn("mes", month("fecha_firma_dt"))
)

# Contracts per (year, month), in chronological order.
monthly_counts = df_temporal.groupBy("anio", "mes").count()
monthly_counts.orderBy("anio", "mes").show()

## Limpieza adicional (calidad de datos)

In [None]:
# Keep only rows whose key categorical columns are usable: department present
# and not the "NO DEFINIDO" placeholder (case-insensitive), and contract type
# and state present.
department = col("departamento")
has_valid_department = department.isNotNull() & (upper(department) != "NO DEFINIDO")
has_contract_type = col("tipo_de_contrato").isNotNull()
has_contract_state = col("estado_contrato").isNotNull()

df_clean = df_clean.filter(
    has_valid_department & has_contract_type & has_contract_state
)

# Integer version of the added-days column; values that are missing (or that
# cannot be cast) become 0.
added_days_as_int = col("dias_adicionados").cast("int")
df_clean = (
    df_clean
    .withColumn("dias_adicionados_num", added_days_as_int)
    .fillna(0, subset=["dias_adicionados_num"])
)

clean_rows = df_clean.count()
print(f"Registros después de limpieza: {clean_rows:,}")

## Guardar dataset limpio para feature engineering

In [None]:
# Persist the cleaned dataset for the feature-engineering step, replacing any
# previous output at the same location.
output_path = "/opt/spark-data/processed/secop_eda.parquet"
df_clean.write.parquet(output_path, mode="overwrite")
print(f"Dataset limpio guardado en: {output_path}")

# Release the Spark session once the output has been written.
spark.stop()