# Notebook 10: MLflow Tracking con Experimentos y Artefactos

In [None]:
# Spark entry point and ML building blocks
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col

# Experiment tracking
import mlflow
import mlflow.spark

# Plotting and local data handling
import matplotlib.pyplot as plt
import pandas as pd

# Build (or reuse, via getOrCreate) a Spark session running on the local[*] master.
spark = (
    SparkSession.builder
    .appName("SECOP_MLflow")
    .master("local[*]")
    .getOrCreate()
)

### Reto 1: Configurar MLflow tracking server y experimento

In [None]:
# Name of the experiment that the runs in this notebook are filed under.
experiment_name = "secop_prediccion"

# Point the MLflow client at the tracking server, then select the experiment.
mlflow.set_tracking_uri("http://mlflow:5000")
mlflow.set_experiment(experiment_name)

In [None]:
# Load the PCA dataset and expose the PCA vector under the column name the
# estimators in this notebook are configured with ("features").
df = (
    spark.read.parquet("/opt/spark-data/processed/secop_ml_ready.parquet")
    .withColumnRenamed("features_pca", "features")
    .filter(col("label").isNotNull())  # rows without a target cannot be trained on or scored
)

# Fixed seed for a reproducible 80/20 split.
train, test = df.randomSplit([0.8, 0.2], seed=42)

# Both halves are reused by every run in this notebook (one fit plus several
# evaluator passes each). Caching them avoids re-reading the parquet file and
# redoing the filter and split on every one of those passes.
train = train.cache()
test = test.cache()

# One evaluator per metric, each comparing "prediction" against "label".
evaluator_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
evaluator_mae = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="mae")
evaluator_r2 = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")

### Reto 2: Experimento baseline (sin regularización)

In [None]:
# Baseline run: linear regression with no regularization, tracked in MLflow.
# The hyperparameters live in one dict that feeds both the MLflow log and the
# estimator, so the tracked values cannot drift from the ones actually used.
baseline_params = {"regParam": 0.0, "elasticNetParam": 0.0, "maxIter": 100}

with mlflow.start_run(run_name="baseline_model"):
    mlflow.log_params(baseline_params)

    lr = LinearRegression(featuresCol="features", labelCol="label", **baseline_params)
    model = lr.fit(train)
    pred = model.transform(test)

    # Score the held-out split with each evaluator.
    rmse = evaluator_rmse.evaluate(pred)
    mae = evaluator_mae.evaluate(pred)
    r2 = evaluator_r2.evaluate(pred)

    # Send all metrics in a single call instead of one call per metric.
    mlflow.log_metrics({"rmse": rmse, "mae": mae, "r2": r2})
    mlflow.spark.log_model(model, "model")

    print(f"RMSE: {rmse:,.2f}, R²: {r2:.4f}")

### Reto 3: Registrar múltiples modelos (Ridge, Lasso, ElasticNet)

In [None]:
# Each entry is (run name, regParam, elasticNetParam). As the run names
# indicate, elasticNetParam 0.0 is the L2 (Ridge) penalty, 1.0 is the
# L1 (Lasso) penalty, and 0.5 mixes both (ElasticNet).
configs = [
    ("ridge_l2", 0.1, 0.0),
    ("lasso_l1", 0.1, 1.0),
    ("elasticnet", 0.1, 0.5),
]

for name, reg, elastic in configs:
    # One dict feeds both the MLflow log and the estimator, so the tracked
    # hyperparameters are always the ones the model was trained with.
    params = {"regParam": reg, "elasticNetParam": elastic, "maxIter": 100}

    with mlflow.start_run(run_name=name):
        mlflow.log_params(params)

        lr = LinearRegression(featuresCol="features", labelCol="label", **params)
        model = lr.fit(train)
        pred = model.transform(test)

        # Score the held-out split with each evaluator.
        rmse = evaluator_rmse.evaluate(pred)
        mae = evaluator_mae.evaluate(pred)
        r2 = evaluator_r2.evaluate(pred)

        # Send all metrics in a single call instead of one call per metric.
        mlflow.log_metrics({"rmse": rmse, "mae": mae, "r2": r2})
        mlflow.spark.log_model(model, "model")
        print(f"{name} - RMSE: {rmse:,.2f}, R²: {r2:.4f}")

### Reto 5: Agregar artefactos personalizados (reportes, gráficos)

In [None]:
# Train one regularized model and attach diagnostic plots to its MLflow run.
# The hyperparameters live in one dict that feeds both the MLflow log and the
# estimator, so the tracked values are the ones actually used.
artifact_params = {"regParam": 0.01, "elasticNetParam": 0.5, "maxIter": 100}

with mlflow.start_run(run_name="modelo_con_artefactos"):
    # Log hyperparameters and the same three metrics as the other runs of this
    # experiment, so this run can be compared against them and reproduced.
    mlflow.log_params(artifact_params)

    lr = LinearRegression(featuresCol="features", labelCol="label", **artifact_params)
    model = lr.fit(train)
    pred = model.transform(test)

    rmse = evaluator_rmse.evaluate(pred)
    mae = evaluator_mae.evaluate(pred)
    r2 = evaluator_r2.evaluate(pred)
    mlflow.log_metrics({"rmse": rmse, "mae": mae, "r2": r2})

    # Plot 1: predictions vs. actual values (at most 500 rows are collected).
    pdf = pred.select("label", "prediction").limit(500).toPandas()
    label_range = [pdf["label"].min(), pdf["label"].max()]
    fig_scatter, ax_scatter = plt.subplots(figsize=(8, 6))
    ax_scatter.scatter(pdf["label"], pdf["prediction"], alpha=0.5)
    ax_scatter.plot(label_range, label_range, "r--")  # y = x reference line
    ax_scatter.set_xlabel("Valor Real")
    ax_scatter.set_ylabel("Predicción")
    ax_scatter.set_title("Predicciones vs Reales")
    fig_scatter.tight_layout()
    # log_figure stores the image directly in the run, so no fixed /tmp path
    # is needed (those are non-portable and collide between sessions).
    mlflow.log_figure(fig_scatter, "pred_vs_real.png")

    # Plot 2: residuals (label - prediction). There is no limit() here, so
    # one value per test row is collected to the driver.
    resid = (
        pred.withColumn("resid", col("label") - col("prediction"))
        .select("resid")
        .toPandas()
    )
    fig_resid, ax_resid = plt.subplots(figsize=(8, 4))
    ax_resid.hist(resid["resid"], bins=50, edgecolor="black")
    ax_resid.set_xlabel("Residuo")
    ax_resid.set_ylabel("Frecuencia")
    ax_resid.set_title("Histograma de Residuos")
    fig_resid.tight_layout()
    mlflow.log_figure(fig_resid, "residuos.png")

    mlflow.spark.log_model(model, "model")
    print("Artefactos guardados en MLflow")

In [None]:
# Stop the Spark session created at the top of the notebook.
spark.stop()