In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import (
    StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
)
from pyspark.ml import Pipeline
from pyspark.sql.functions import col, stddev, variance, var_pop
import os

# --- Build a SparkSession attached to the standalone cluster ---
# getOrCreate() hands back an already-running session if one exists in this
# kernel, so re-running the cell does not spawn a second application.
spark = (
    SparkSession.builder
    .appName("SECOP_FeatureEngineering")
    .master("spark://spark-master:7077")
    .config("spark.executor.memory", "2g")
    .getOrCreate()
)

# Load the EDA (phase 1) output. cache() only marks the DataFrame for caching;
# it is materialized by the first action that touches it.
df = (
    spark.read
    .parquet("/opt/spark-data/processed/secop_eda_final.parquet")
    .cache()
)




Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/15 01:39:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

In [None]:
# %% CHALLENGE 1: Select categorical and numeric features
categorical_cols = [
    "departamento",
    "tipo_de_contrato",
    "estado_contrato",
]
# Add further numeric columns here (e.g. 'plazo_ejecucion') if they exist.
numeric_cols = ["dias_adicionados_num"]
target_col = "valor_del_contrato_num"

# %% CHALLENGE 2: Data-cleaning strategy
# Drop every row holding a null in any column the pipeline reads
# (categorical inputs, numeric inputs, or the target).
cols_to_check = [*categorical_cols, *numeric_cols, target_col]
df_clean = df.dropna(subset=cols_to_check)

# %% BUILD THE PIPELINE STAGES

# 1. StringIndexer: text -> numeric indices, one stage per categorical column.
#    handleInvalid="keep" routes labels not seen during fit to an extra index
#    instead of raising at transform time.
indexers = [
    StringIndexer(
        inputCol=name,
        outputCol=f"{name}_idx",
        handleInvalid="keep",
    )
    for name in categorical_cols
]

# 2. OneHotEncoder: indices -> binary vectors (SparseVectors), reading the
#    "<col>_idx" columns produced by the indexers above.
encoders = [
    OneHotEncoder(
        inputCol=f"{name}_idx",
        outputCol=f"{name}_vec",
    )
    for name in categorical_cols
]


26/02/15 01:40:06 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [None]:
# %% CHALLENGE 3: VectorAssembler that combines all features
# Numeric columns come first, followed by the one-hot vector of each
# categorical column.
assembler_inputs = [*numeric_cols, *(f"{c}_vec" for c in categorical_cols)]
assembler = VectorAssembler(
    inputCols=assembler_inputs,
    outputCol="features_unscaled",
)

# 3. StandardScaler: important for regression models.
#    withStd=True / withMean=False scales by the standard deviation without
#    centering. NOTE(review): presumably chosen so the sparse one-hot vectors
#    are not densified -- confirm against the StandardScaler docs.
scaler = StandardScaler(
    inputCol="features_unscaled",
    outputCol="features",
    withStd=True,
    withMean=False,
)

# %% CHALLENGE 4: Full Pipeline: index -> encode -> assemble -> scale
pipeline_stages = [*indexers, *encoders, assembler, scaler]
pipeline = Pipeline(stages=pipeline_stages)

# Fit every stage on the cleaned data, then apply the fitted model to it.
print("Entrenando Pipeline...")
pipeline_model = pipeline.fit(df_clean)
df_transformed = pipeline_model.transform(df_clean)

# %% BONUS 1: Total feature dimension after encoding
# Pull a single row and measure the length of its feature vector.
sample_row = df_transformed.select("features").first()[0]
print(f"\n>>> Dimensión total del vector de entrada: {len(sample_row)}")

# %% BONUS 2: Variance analysis of the target variable
# var_pop is the population variance of the target column.
print("\n--- Análisis de Varianza de la Variable Objetivo ---")
df_transformed.select(
    var_pop(target_col).alias("varianza_target")
).show()

Entrenando Pipeline...


                                                                                


>>> Dimensión total del vector de entrada: 64

--- Análisis de Varianza de la Variable Objetivo ---
+-------------------+
|    varianza_target|
+-------------------+
|7.48872791080346E19|
+-------------------+



In [None]:

# %% SAVE RESULTS
pipeline_path = "/opt/spark-data/processed/feature_pipeline_model"
output_data_path = "/opt/spark-data/processed/secop_features.parquet"

# Persist the fitted pipeline, overwriting anything already at that path.
pipeline_model.write().overwrite().save(pipeline_path)

# Persist only the two columns selected here -- the scaled feature vector and
# the target -- as Parquet, overwriting any previous output.
(
    df_transformed
    .select("features", target_col)
    .write
    .mode("overwrite")
    .parquet(output_data_path)
)

print("Pipeline y datos guardados exitosamente.")

                                                                                

Pipeline y datos guardados exitosamente.


26/02/15 02:09:43 ERROR StandaloneSchedulerBackend: Application has been killed. Reason: Master removed our application: KILLED
26/02/15 02:09:43 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exiting due to error from cluster scheduler: Master removed our application: KILLED
	at org.apache.spark.errors.SparkCoreErrors$.clusterSchedulerError(SparkCoreErrors.scala:291)
	at org.apache.spark.scheduler.TaskSchedulerImpl.error(TaskSchedulerImpl.scala:981)
	at org.apache.spark.scheduler.cluster.StandaloneSchedulerBackend.dead(StandaloneSchedulerBackend.scala:165)
	at org.apache.spark.deploy.client.StandaloneAppClient$ClientEndpoint.markDead(StandaloneAppClient.scala:263)
	at org.apache.spark.deploy.client.StandaloneAppClient$ClientEndpoint$$anonfun$receive$1.applyOrElse(StandaloneAppClient.scala:170)
	at org.apache.spark.rpc.netty.Inbox.$anonfun$process$1(Inbox.scala:115)
	at org.apache.spark.rpc.netty.Inbox.safelyCall(Inbox.scala:213)
	at org.apache.spark.rpc.netty.Inbox.proce