# Notebook 03: Feature Engineering con Pipeline\n**StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler, PCA**

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler, PCA
)
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, when, isnull, upper

spark = SparkSession.builder \
    .appName("SECOP_FeatureEngineering") \
    .master("local[*]") \
    .config("spark.executor.memory", "8g") \
    .getOrCreate()

In [None]:
# Load the cleaned dataset written by the EDA step.
eda_parquet_path = "/opt/spark-data/processed/secop_eda.parquet"
df = spark.read.parquet(eda_parquet_path)

# count() runs a Spark job over the data; the result is only used for display.
total_rows = df.count()
print(f"Registros cargados: {total_rows:,}")

### Reto 1: Seleccionar features categóricas y numéricas

In [None]:
# Candidate feature columns and the target column.
categorical_cols = [
    "departamento",
    "tipo_de_contrato",
    "estado_contrato",
    "modalidad_de_contratacion",
    "sector",
    "orden",
]
numeric_cols = ["dias_adicionados_num"]
target_col = "valor_del_contrato_num"

# Keep only the candidates that actually exist in the loaded DataFrame, so the
# later stages never reference a missing column.
existing_columns = df.columns
available_cat = [name for name in categorical_cols if name in existing_columns]
available_num = [name for name in numeric_cols if name in existing_columns]

print(f"Categóricas: {available_cat}")
print(f"Numéricas: {available_num}")

### Reto 2: Estrategia de limpieza (ya aplicada en EDA)

In [None]:
# Count nulls in every selected feature column and in the target.
# `count` must be imported from pyspark.sql.functions in the imports cell;
# without it this cell raises NameError on a fresh kernel.
#
# when(isnull(c), c) produces a non-null literal only on rows where column `c`
# is null (the second argument is a plain string, so it is treated as a literal
# value), and count() ignores nulls -- so each output cell is the number of
# null rows for that column.
columns_to_check = available_cat + available_num + [target_col]
null_counts = df.select(
    [count(when(isnull(c), c)).alias(c) for c in columns_to_check]
)
null_counts.show()

### Reto 3: VectorAssembler para combinar features

In [None]:
# One StringIndexer per categorical column: <name> -> <name>_idx.
# handleInvalid="keep" is passed so invalid labels are kept instead of raising
# (see Spark's StringIndexer docs for the exact bucketing).
indexers = [
    StringIndexer(
        inputCol=cat_name,
        outputCol=f"{cat_name}_idx",
        handleInvalid="keep",
    )
    for cat_name in available_cat
]

# One OneHotEncoder per indexed column: <name>_idx -> <name>_vec.
encoders = [
    OneHotEncoder(inputCol=f"{cat_name}_idx", outputCol=f"{cat_name}_vec")
    for cat_name in available_cat
]

# Numeric columns first, then the one-hot vectors, assembled into one vector.
encoded_cols = [f"{cat_name}_vec" for cat_name in available_cat]
feature_cols = available_num + encoded_cols
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features_raw")

### Reto 4: Pipeline completo (orden correcto)

In [None]:
# Stage order matters: each encoder reads the *_idx column produced by its
# indexer, and the assembler reads the *_vec columns produced by the encoders.
pipeline_stages = [*indexers, *encoders, assembler]
pipeline = Pipeline(stages=pipeline_stages)
print(f"Pipeline con {len(pipeline_stages)} stages")

# Fit on the full DataFrame and apply the fitted model to the same data.
pipeline_model = pipeline.fit(df)
df_transformed = pipeline_model.transform(df)

# Show only the schema of the assembled feature column.
df_transformed.select("features_raw").printSchema()

### Bonus 1: Calcular dimensión total de features post-encoding

In [None]:
# Every row's assembled vector has the same length, so one row is enough to
# read the post-encoding feature dimension.
first_row = df_transformed.select("features_raw").first()
sample_vec = first_row["features_raw"]
dimension_raw = len(sample_vec)
print(f"Dimensión del vector features_raw: {dimension_raw}")

### Escalamiento y PCA (reducción de dimensionalidad)

In [None]:
# Standardize the assembled features (zero mean, unit standard deviation).
# NOTE(review): per the Spark docs, withMean=True builds dense output, which
# can be memory-hungry on wide one-hot vectors -- confirm this is acceptable.
scaler = StandardScaler(
    inputCol="features_raw",
    outputCol="features_scaled",
    withStd=True,
    withMean=True,
)
scaler_model = scaler.fit(df_transformed)
df_scaled = scaler_model.transform(df_transformed)

# PCA cannot be fit with k larger than the number of input features. The
# feature count depends on which columns were available, so clamp the requested
# k instead of failing when the vector has fewer than 20 entries.
pca_k_requested = 20
n_features = len(scaler_model.mean)
pca_k = min(pca_k_requested, n_features)

pca = PCA(k=pca_k, inputCol="features_scaled", outputCol="features_pca")
pca_model = pca.fit(df_scaled)
df_final = pca_model.transform(df_scaled)

# explainedVariance holds one proportion per retained component.
explained_total = float(pca_model.explainedVariance.toArray().sum())
print(f"Varianza explicada por los primeros {pca_k} componentes: {explained_total:.4f}")

### Bonus 2: Análisis de varianza de features

In [None]:
# Cumulative explained variance per number of retained components.
# `plt` (matplotlib.pyplot) is imported in the notebook's imports cell.
explained_var = pca_model.explainedVariance
cumulative_var = explained_var.toArray().cumsum().tolist()

# Derive the x axis from the data so the plot stays valid for any PCA k,
# rather than assuming exactly 20 components.
component_counts = range(1, len(cumulative_var) + 1)

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(component_counts, cumulative_var, marker='o')
ax.set_xlabel('Número de componentes')
ax.set_ylabel('Varianza acumulada explicada')
ax.set_title('Análisis de Varianza - PCA')
ax.grid()
fig.savefig('/opt/spark-data/processed/pca_variance.png', dpi=150)
plt.show()

## Guardar pipeline y dataset listo para ML

In [None]:
# Persist the fitted feature pipeline.
# write().overwrite() replaces an existing directory; a plain save() raises if
# the path already exists, which breaks any second run of the notebook.
# NOTE(review): pipeline_model only contains the indexers, encoders and the
# assembler. scaler_model and pca_model are fit outside the pipeline and are
# not saved here, so this artifact alone cannot reproduce "features_pca" --
# confirm whether they should be persisted too.
pipeline_path = "/opt/spark-data/processed/feature_pipeline"
pipeline_model.write().overwrite().save(pipeline_path)
print(f"Pipeline guardado en: {pipeline_path}")

# Write the ML-ready dataset: the target renamed to "label" plus PCA features.
output_path = "/opt/spark-data/processed/secop_ml_ready.parquet"
ml_ready = (
    df_final
    .select(target_col, "features_pca")
    .withColumnRenamed(target_col, "label")
)
ml_ready.write.mode("overwrite").parquet(output_path)
print(f"Dataset listo para ML guardado en: {output_path}")

# Release Spark resources; no Spark operation can run after this point.
spark.stop()