# Notebook 05: Regresión Lineal con Evaluación Completa

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Build (or reuse, via getOrCreate) a Spark session that runs locally on all
# available cores. The builder chain is wrapped in parentheses so that no
# backslash line continuations are needed.
spark = (
    SparkSession.builder
    .appName("SECOP_RegresionLineal")
    .master("local[*]")
    .getOrCreate()
)

In [None]:
# Load the ML-ready dataset and expose its "features_pca" column under the
# name "features", which is the features column the regression cell reads.
df = (
    spark.read
    .parquet("/opt/spark-data/processed/secop_ml_ready.parquet")
    .withColumnRenamed("features_pca", "features")
)

df.printSchema()

# Counting triggers a Spark job; the result is only used for the report line.
total_rows = df.count()
print(f"Registros: {total_rows:,}")

### Reto 1: Estrategia de train/test split

In [None]:
# 80/20 random split; the fixed seed keeps the split reproducible across runs
# over the same input.
split_weights = [0.8, 0.2]
train, test = df.randomSplit(split_weights, seed=42)

# Report the size of each partition (each count() runs a Spark job).
for split_name, split_df in (("Train", train), ("Test", test)):
    print(f"{split_name}: {split_df.count():,}")

### Reto 2: Configurar modelo de LinearRegression

In [None]:
# Hyper-parameters for the linear regression, gathered in one place.
# regParam=0.0 disables regularization, so elasticNetParam has no effect here.
lr_params = {
    "featuresCol": "features",
    "labelCol": "label",
    "maxIter": 100,
    "regParam": 0.0,
    "elasticNetParam": 0.0,
    "solver": "auto",
}

lr = LinearRegression(**lr_params)

# Fit on the training partition only; the test partition is kept for evaluation.
lr_model = lr.fit(train)

### Reto 5: Comparar train vs test (detección de overfitting)

In [None]:
# Score both partitions with the fitted model so train and test can be compared.
train_pred = lr_model.transform(train)
test_pred = lr_model.transform(test)

evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction")

# setMetricName mutates the evaluator in place, so the metric is switched once
# per metric and both partitions are evaluated under it.
evaluator.setMetricName("rmse")
train_rmse = evaluator.evaluate(train_pred)
test_rmse = evaluator.evaluate(test_pred)

evaluator.setMetricName("r2")
train_r2 = evaluator.evaluate(train_pred)
test_r2 = evaluator.evaluate(test_pred)

# A test RMSE well above the train RMSE is a sign of overfitting.
rmse_gap = test_rmse - train_rmse

print(f"Train RMSE: {train_rmse:,.2f}, R²: {train_r2:.4f}")
print(f"Test  RMSE: {test_rmse:,.2f}, R²: {test_r2:.4f}")
print(f"Diferencia RMSE: {rmse_gap:,.2f}")

### Reto 3: Interpretar R²

In [None]:
# A negative R² on the test set means the model predicts worse than simply
# predicting the mean of the label.
# That would suggest the current features do not explain the variability of
# the target.
# NOTE(review): the actual test R² is the value printed by the evaluation
# cell - confirm it is negative before relying on this conclusion.

### Reto 4: Analizar calidad de predicciones y errores

In [None]:
# Imported under an alias on purpose: bringing the Spark abs() into scope by
# name would shadow Python's builtin abs(), which another cell of this
# notebook applies to plain floats.
from pyspark.sql import functions as F

# Preview a few label/prediction pairs from the test set.
test_pred.select("label", "prediction").show(10)

# Absolute error per row.
# The builtin abs() cannot be applied to a Spark Column (it raises
# "TypeError: bad operand type for abs(): 'Column'"), so the Spark SQL
# function is used instead.
test_pred = test_pred.withColumn(
    "error_abs",
    F.abs(col("label") - col("prediction")),
)

# Summary statistics (count, mean, stddev, min, max) of the absolute error.
test_pred.select("error_abs").describe().show()

### Reto 6: Analizar coeficientes del modelo

In [None]:
# One coefficient per entry of the "features" vector, i.e. per PCA component.
# NOTE(review): the original note says there are 20 components - verify
# against the upstream PCA stage.
coeffs = lr_model.coefficients
intercept = lr_model.intercept

print("Coeficientes:", coeffs)
print(f"Intercepto: {intercept:,.2f}")

### Bonus 1: Distribución de residuos

In [None]:
# Residual = observed label minus model prediction, computed on the test set.
residuals = test_pred.withColumn("residual", col("label") - col("prediction"))

# Collect only the residual column to the driver for plotting.
pdf_res = residuals.select("residual").toPandas()

# Histogram of the residuals, drawn through the object-oriented matplotlib API.
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(pdf_res['residual'], bins=50, edgecolor='black')
ax.set_xlabel('Residuo')
ax.set_ylabel('Frecuencia')
ax.set_title('Distribución de Residuos')

# Save before show(): some backends release the figure once it is displayed.
fig.savefig('/opt/spark-data/processed/residuos_lr.png', dpi=150)
plt.show()

### Bonus 2: Feature importance aproximado (magnitud de coeficientes)

In [None]:
# Approximate feature importance: the magnitude of each regression coefficient.
importance = list(map(abs, coeffs))
positions = range(len(importance))

# Bar chart with one bar per coefficient, indexed by component position.
fig, ax = plt.subplots(figsize=(12, 5))
ax.bar(positions, importance)
ax.set_xlabel('Componente PCA')
ax.set_ylabel('|Coeficiente|')
ax.set_title('Importancia de Componentes PCA')

# Save before show(): some backends release the figure once it is displayed.
fig.savefig('/opt/spark-data/processed/feature_importance_lr.png', dpi=150)
plt.show()

In [None]:
# Stop the Spark session created at the top of the notebook.
spark.stop()