In [0]:
path_gold = 'abfss://datalake@dls0tfm.dfs.core.windows.net/gold/'

In [0]:
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import LinearRegression, RandomForestRegressor, LinearRegressionModel, GBTRegressor
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
import mlflow
import mlflow.spark
from datetime import datetime

## Ganancias

In [0]:
df_dias = spark.read.format('delta').load(f'{path_gold}dias')

In [0]:
df_dias.printSchema

<bound method DataFrame.printSchema of DataFrame[fecha: date, dia_semana: int, mes: int, ano: int, tipo: string, ganancia: double, tavg: double, tmin: double, tmax: double, prcp: double, snow: int, wspd: double, wpgt: double, pres: double, tsun: double, processing_date: date]>

In [0]:
categorical_cols = [field for (field, dataType) in df_dias.dtypes if dataType == "string"]
index_output_cols = [x + "Index" for x in categorical_cols]
string_indexer = StringIndexer(inputCols=categorical_cols, outputCols=index_output_cols, handleInvalid="skip")

In [0]:
numerical_cols = [field for (field, dataType) in df_dias.dtypes if dataType in ('int','double') and field not in ['ganancia']]
numerical_cols


['dia_semana',
 'mes',
 'ano',
 'tavg',
 'tmin',
 'tmax',
 'prcp',
 'snow',
 'wspd',
 'wpgt',
 'pres',
 'tsun']

In [0]:
train_df, test_df = df_dias.randomSplit([.8, .2], seed=124)

In [0]:
assembler_inputs = index_output_cols + numerical_cols
vec_assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")


### Regresión multilineal

In [0]:
lr = LinearRegression(featuresCol='features', labelCol='ganancia')
pipeline = Pipeline(stages=[string_indexer, vec_assembler, lr])

In [0]:
#print(lr.explainParams())

aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)
epsilon: The shape parameter to control the amount of robustness. Must be > 1.0. Only valid when loss is huber (default: 1.35)
featuresCol: features column name. (default: features, current: features)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label, current: ganancia)
loss: The loss function to be optimized. Supported options: squaredError, huber. (default: squaredError)
maxBlockSizeInMB: maximum memory in MB for stacking input data into blocks. Data is stacked within partitions. If more than remaining data size in a partition then it is adjusted to the data size. Default 0.0 represents choosing optimal value, depends on specific algorithm. Must be >= 0. (default: 0.0)
maxIter: max number of it

In [0]:
# parámetros

param_grid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.01, 0.1]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])\
    .addGrid(lr.maxIter, [50, 75, 100]) \
    .build()

evaluator = RegressionEvaluator(labelCol='ganancia', predictionCol='prediction')

cv = CrossValidator(estimator=pipeline, evaluator=evaluator, estimatorParamMaps=param_grid, numFolds=3, seed=17)

In [0]:
run_name = f"ganancia_{datetime.now().strftime('%Y%m%d_%H%M%S')}" # la segunda parte es el formato en que queremos el datetime
run_name = f"ganancia_{datetime.now().strftime('%Y%m%d_%H%M%S')}" 
with mlflow.start_run(run_name=run_name) as run:

    # Entrenamiento
    pipeline_model = cv.fit(train_df)

    # Predicciones
    pred_df = pipeline_model.transform(test_df) # transform es equivalente al predict de scikit-learn

    # Métricas
    rmse = evaluator.evaluate(pred_df)
    r2_evaluator = evaluator.copy({}).setMetricName("r2")
    r2 = r2_evaluator.evaluate(pred_df)
    
    print(f"RMSE: {rmse}")
    print(f"R2: {r2}")

    # Logging a MLflow
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)

    # Logging hiperparámetros (opcional)
    avg_metrics = pipeline_model.avgMetrics
    param_maps = pipeline_model.getEstimatorParamMaps()
    for i, (params, metric) in enumerate(zip(param_maps, avg_metrics)):
        mlflow.log_metric(f"fold_{i}_metric", metric)

   # Guardamos el pipeline completo, que incluye al mejor modelo y las transformaciones del pipeline, necesarias para la predicción
    mlflow.spark.log_model(pipeline_model, "full_pipeline")

    # Opcional: Logging del mejor modelo (última etapa del pipeline contiene el CV), vamos a necesitar el pipeline para predecir en cualquier caso
    best_model = pipeline_model.bestModel
    mlflow.spark.log_model(best_model, "best_model")
    lr_model = best_model.stages[-1]
    assert isinstance(lr_model, LinearRegressionModel)
    coef_list = lr_model.coefficients.toArray().tolist()
    intercept = lr_model.intercept
    for i, coef in enumerate(coef_list):
        mlflow.log_param(f"best_coef_{i}", coef)
    mlflow.log_param("best_intercept", intercept)

RMSE: 375.7152300967988
R2: 0.5181991650598177


2025/09/08 13:11:45 INFO mlflow.spark: Inferring pip requirements by reloading the logged model from the databricks artifact repository, which can be time-consuming. To speed up, explicitly specify the conda_env or pip_requirements when calling log_model().


Downloading artifacts:   0%|          | 0/65 [00:00<?, ?it/s]



Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

2025/09/08 13:12:33 INFO mlflow.spark: Inferring pip requirements by reloading the logged model from the databricks artifact repository, which can be time-consuming. To speed up, explicitly specify the conda_env or pip_requirements when calling log_model().


Downloading artifacts:   0%|          | 0/30 [00:00<?, ?it/s]



Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

In [0]:
# Repetimos el experimento sin considerar la variable tsun, debido a su coeficiente cercano a 0.

In [0]:
assembler_inputs2 = assembler_inputs[:-1]
vec_assembler2 = VectorAssembler(inputCols=assembler_inputs2, outputCol="features")
lr2 = LinearRegression(featuresCol='features', labelCol='ganancia')
pipeline2 = Pipeline(stages=[string_indexer, vec_assembler2, lr2])

In [0]:
cv2 = CrossValidator(estimator=pipeline2, evaluator=evaluator, estimatorParamMaps=param_grid, numFolds=3, seed=17)

In [0]:

run_name = f"ganancia_{datetime.now().strftime('%Y%m%d_%H%M%S')}" 
with mlflow.start_run(run_name=run_name) as run:

    # Entrenamiento
    pipeline_model2 = cv2.fit(train_df)

    # Predicciones
    pred_df = pipeline_model2.transform(test_df) # transform es equivalente al predict de scikit-learn

    # Métricas
    rmse = evaluator.evaluate(pred_df)
    r2_evaluator = evaluator.copy({}).setMetricName("r2")
    r2 = r2_evaluator.evaluate(pred_df)
    
    print(f"RMSE: {rmse}")
    print(f"R2: {r2}")

    # Logging a MLflow
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)

    # Logging hiperparámetros (opcional)
    #avg_metrics = pipeline_model.avgMetrics
    #param_maps = pipeline_model.getEstimatorParamMaps()
    #for i, (params, metric) in enumerate(zip(param_maps, avg_metrics)):
    #    mlflow.log_metric(f"fold_{i}_metric", metric)

    print("Tipo:", type(pipeline_model2))
    print("Tiene bestModel:", hasattr(pipeline_model2, "bestModel"))

    if hasattr(pipeline_model2, "bestModel"):
        print("Tipo de bestModel:", type(pipeline_model2.bestModel))
    
    # ¿Tiene etapas?
    if hasattr(pipeline_model2.bestModel, "stages"):
        print("Etapas del bestModel:", pipeline_model2.bestModel.stages)
    else:
        print(" El bestModel no tiene etapas (no es un PipelineModel)")

   # Guardamos el pipeline completo, que incluye al mejor modelo y las transformaciones del pipeline, necesarias para la predicción
    #mlflow.spark.log_model(pipeline_model2, "full_pipeline")

    # Opcional: Logging del mejor modelo (última etapa del pipeline contiene el CV), vamos a necesitar el pipeline para predecir en cualquier caso
    best_model = pipeline_model2.bestModel
    mlflow.spark.log_model(best_model, "best_model")
    lr_model = best_model.stages[-1]
    assert isinstance(lr_model, LinearRegressionModel)
    coef_list = lr_model.coefficients.toArray().tolist()
    intercept = lr_model.intercept
    for i, coef in enumerate(coef_list):
        mlflow.log_param(f"best_coef_{i}", coef)
    mlflow.log_param("best_intercept", intercept)

RMSE: 375.915680229751
R2: 0.5176849308931606
Tipo: <class 'pyspark.ml.tuning.CrossValidatorModel'>
Tiene bestModel: True
Tipo de bestModel: <class 'pyspark.ml.pipeline.PipelineModel'>
Etapas del bestModel: [StringIndexerModel: uid=StringIndexer_15355b634383, handleInvalid=skip, numInputCols=1, numOutputCols=1, VectorAssembler_76eab07e8882, LinearRegressionModel: uid=LinearRegression_99a76127d68e, numFeatures=12]


2025/09/08 13:42:16 INFO mlflow.spark: Inferring pip requirements by reloading the logged model from the databricks artifact repository, which can be time-consuming. To speed up, explicitly specify the conda_env or pip_requirements when calling log_model().


Downloading artifacts:   0%|          | 0/30 [00:00<?, ?it/s]



Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

### Random forest

In [0]:
rf = RandomForestRegressor(featuresCol='features', labelCol='ganancia')
pipeline_rf = Pipeline(stages=[string_indexer, vec_assembler, rf])
param_grid_rf = ParamGridBuilder() \
    .addGrid(rf.maxDepth, [2, 5, 10, 15]) \
    .addGrid(rf.numTrees, [5, 10, 15]) \
    .build()\

evaluator = RegressionEvaluator(labelCol='ganancia', predictionCol='prediction')

cv_rf = CrossValidator(estimator=pipeline_rf, evaluator=evaluator, estimatorParamMaps=param_grid_rf, numFolds=3, seed=40)

In [0]:
run_name = f"rf_ganancia_{datetime.now().strftime('%Y%m%d_%H%M%S')}" 
with mlflow.start_run(run_name=run_name) as run:

    # Entrenamiento
    cv_model = cv_rf.fit(train_df)

    # Predicciones
    pred_df = cv_model.transform(test_df) # transform es equivalente al predict de scikit-learn

    # Métricas
    rmse = evaluator.evaluate(pred_df)
    r2_evaluator = evaluator.copy({}).setMetricName("r2")
    r2 = r2_evaluator.evaluate(pred_df)
    
    print(f"RMSE: {rmse}")
    print(f"R2: {r2}")

    # Logging a MLflow
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)

    # Logging hiperparámetros (opcional)
    avg_metrics = cv_model.avgMetrics
    param_maps = cv_model.getEstimatorParamMaps()
    for i, (params, metric) in enumerate(zip(param_maps, avg_metrics)):
        mlflow.log_metric(f"fold_{i}_metric", metric)

   # Guardamos el pipeline completo, que incluye al mejor modelo y las transformaciones del pipeline, necesarias para la predicción
    mlflow.spark.log_model(cv_model, "full_pipeline")

    # Opcional: Logging del mejor modelo (última etapa del pipeline contiene el CV), vamos a necesitar el pipeline para predecir en cualquier caso
    best_model = cv_model.bestModel
    mlflow.spark.log_model(best_model, "best_model")

RMSE: 344.9016365030823
R2: 0.5939865149863153


2025/09/08 13:56:31 INFO mlflow.spark: Inferring pip requirements by reloading the logged model from the databricks artifact repository, which can be time-consuming. To speed up, explicitly specify the conda_env or pip_requirements when calling log_model().


Downloading artifacts:   0%|          | 0/70 [00:00<?, ?it/s]



Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

2025/09/08 13:57:23 INFO mlflow.spark: Inferring pip requirements by reloading the logged model from the databricks artifact repository, which can be time-consuming. To speed up, explicitly specify the conda_env or pip_requirements when calling log_model().


Downloading artifacts:   0%|          | 0/35 [00:00<?, ?it/s]



Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

In [0]:
rf = RandomForestRegressor(featuresCol='features', labelCol='ganancia')
pipeline_rf = Pipeline(stages=[string_indexer, vec_assembler, rf])
param_grid_rf = ParamGridBuilder() \
    .addGrid(rf.maxDepth, [2, 5, 10, 15]) \
    .addGrid(rf.numTrees, [5, 10, 15]) \
    .build()\

evaluator = RegressionEvaluator(labelCol='ganancia', predictionCol='prediction')

cv_rf = CrossValidator(estimator=pipeline_rf, evaluator=evaluator, estimatorParamMaps=param_grid_rf, numFolds=3, seed=40)

In [0]:
run_name = f"rf_ganancia_{datetime.now().strftime('%Y%m%d_%H%M%S')}" 
with mlflow.start_run(run_name=run_name) as run:

    # Entrenamiento
    cv_model = cv_rf.fit(train_df)

    # Predicciones
    pred_df = cv_model.transform(test_df) # transform es equivalente al predict de scikit-learn

    # Métricas
    rmse = evaluator.evaluate(pred_df)
    r2_evaluator = evaluator.copy({}).setMetricName("r2")
    r2 = r2_evaluator.evaluate(pred_df)
    
    print(f"RMSE: {rmse}")
    print(f"R2: {r2}")

    # Logging a MLflow
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)

    # Logging hiperparámetros (opcional)
    avg_metrics = cv_model.avgMetrics
    param_maps = cv_model.getEstimatorParamMaps()
    for i, (params, metric) in enumerate(zip(param_maps, avg_metrics)):
        mlflow.log_metric(f"fold_{i}_metric", metric)

   # Guardamos el pipeline completo, que incluye al mejor modelo y las transformaciones del pipeline, necesarias para la predicción
    #mlflow.spark.log_model(cv_model, "full_pipeline")

    # Opcional: Logging del mejor modelo (última etapa del pipeline contiene el CV), vamos a necesitar el pipeline para predecir en cualquier caso
    best_model = cv_model.bestModel
    mlflow.spark.log_model(best_model, "best_model")

RMSE: 344.9016365030823
R2: 0.5939865149863153


2025/09/09 13:06:55 INFO mlflow.spark: Inferring pip requirements by reloading the logged model from the databricks artifact repository, which can be time-consuming. To speed up, explicitly specify the conda_env or pip_requirements when calling log_model().


Downloading artifacts:   0%|          | 0/35 [00:00<?, ?it/s]



Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

### Gradient boosting

In [0]:

gb = GBTRegressor(featuresCol='features', labelCol='ganancia')
pipeline_gb = Pipeline(stages=[string_indexer, vec_assembler, gb])
param_grid_gb = ParamGridBuilder() \
    .addGrid(rf.maxDepth, [2, 5, 10, 15]) \
    .addGrid(rf.numTrees, [5, 10, 15]) \
    .build()\

evaluator = RegressionEvaluator(labelCol='ganancia', predictionCol='prediction')

cv_gb = CrossValidator(estimator=pipeline_gb, evaluator=evaluator, estimatorParamMaps=param_grid_gb, numFolds=3, seed=40)

In [0]:
run_name = f"gb_ganancia_{datetime.now().strftime('%Y%m%d_%H%M%S')}" 
with mlflow.start_run(run_name=run_name) as run:

    # Entrenamiento
    cv_model = cv_gb.fit(train_df)

    # Predicciones
    pred_df = cv_model.transform(test_df) # transform es equivalente al predict de scikit-learn

    # Métricas
    rmse = evaluator.evaluate(pred_df)
    r2_evaluator = evaluator.copy({}).setMetricName("r2")
    r2 = r2_evaluator.evaluate(pred_df)
    
    print(f"RMSE: {rmse}")
    print(f"R2: {r2}")

    # Logging a MLflow
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)

    # Logging hiperparámetros (opcional)
    avg_metrics = cv_model.avgMetrics
    param_maps = cv_model.getEstimatorParamMaps()
    for i, (params, metric) in enumerate(zip(param_maps, avg_metrics)):
        mlflow.log_metric(f"fold_{i}_metric", metric)

   # Guardamos el pipeline completo, que incluye al mejor modelo y las transformaciones del pipeline, necesarias para la predicción
    #mlflow.spark.log_model(cv_model, "full_pipeline")

    # Opcional: Logging del mejor modelo (última etapa del pipeline contiene el CV), vamos a necesitar el pipeline para predecir en cualquier caso
    best_model = cv_model.bestModel
    mlflow.spark.log_model(best_model, "best_model")

    gbt = cv_model.bestModel.stages[-1]
    importances = gbt.featureImportances.toArray()
    print(importances)

RMSE: 342.3719605771597
R2: 0.5999204733647583


2025/09/09 13:58:37 INFO mlflow.spark: Inferring pip requirements by reloading the logged model from the databricks artifact repository, which can be time-consuming. To speed up, explicitly specify the conda_env or pip_requirements when calling log_model().


Downloading artifacts:   0%|          | 0/35 [00:00<?, ?it/s]



Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

[0.02989911 0.13811919 0.11238584 0.24369005 0.06169668 0.07094247
 0.04540735 0.03349771 0.00786691 0.0545483  0.05197947 0.09447522
 0.05549171]


In [0]:
# lo que más usa, año, día de la semana y mes. Luego, presión, temperatura máxima y media. Vamos a probar sólo con esas.
assembler_inputs_new = ['tavg', 'tmin', 'tmax', 'pres', 'dia_semana', 'mes', 'ano']
vec_assembler_new = VectorAssembler(inputCols=assembler_inputs_new, outputCol='features')

In [0]:
gb = GBTRegressor(featuresCol='features', labelCol='ganancia')
pipeline_gb_new = Pipeline(stages=[vec_assembler_new, gb])
param_grid_gb = ParamGridBuilder() \
    .addGrid(rf.maxDepth, [2, 5, 10, 15]) \
    .addGrid(rf.numTrees, [5, 10, 15]) \
    .build()\

evaluator = RegressionEvaluator(labelCol='ganancia', predictionCol='prediction')

cv_gb_new = CrossValidator(estimator=pipeline_gb_new, evaluator=evaluator, estimatorParamMaps=param_grid_gb, numFolds=3, seed=40)

In [0]:
run_name = f"gb_ganancia_{datetime.now().strftime('%Y%m%d_%H%M%S')}" 
with mlflow.start_run(run_name=run_name) as run:

    # Entrenamiento
    cv_model = cv_gb_new.fit(train_df)

    # Predicciones
    pred_df = cv_model.transform(test_df) # transform es equivalente al predict de scikit-learn

    # Métricas
    rmse = evaluator.evaluate(pred_df)
    r2_evaluator = evaluator.copy({}).setMetricName("r2")
    r2 = r2_evaluator.evaluate(pred_df)
    
    print(f"RMSE: {rmse}")
    print(f"R2: {r2}")

    # Logging a MLflow
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)

    # Logging hiperparámetros (opcional)
    avg_metrics = cv_model.avgMetrics
    param_maps = cv_model.getEstimatorParamMaps()
    for i, (params, metric) in enumerate(zip(param_maps, avg_metrics)):
        mlflow.log_metric(f"fold_{i}_metric", metric)

   # Guardamos el pipeline completo, que incluye al mejor modelo y las transformaciones del pipeline, necesarias para la predicción
    #mlflow.spark.log_model(cv_model, "full_pipeline")

    # Opcional: Logging del mejor modelo (última etapa del pipeline contiene el CV), vamos a necesitar el pipeline para predecir en cualquier caso
    best_model = cv_model.bestModel
    mlflow.spark.log_model(best_model, "best_model")

    gbt = cv_model.bestModel.stages[-1]
    importances = gbt.featureImportances.toArray()
    print(importances)

RMSE: 347.4313109343063
R2: 0.5880088774280954


2025/09/09 14:32:22 INFO mlflow.spark: Inferring pip requirements by reloading the logged model from the databricks artifact repository, which can be time-consuming. To speed up, explicitly specify the conda_env or pip_requirements when calling log_model().


Downloading artifacts:   0%|          | 0/25 [00:00<?, ?it/s]



Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

[0.07821231 0.10213065 0.08651189 0.15977164 0.17734735 0.13019292
 0.26583325]


#### Predicción con gradient boosting

In [0]:
model_path = 'dbfs:/databricks/mlflow-tracking/3554617355181843/b66caf8248ba432aaa9accaa6df392fe/artifacts/best_model'
loaded_model = mlflow.spark.load_model(model_path)

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/39 [00:00<?, ?it/s]

In [0]:
assembler_inputs

['tipoIndex',
 'dia_semana',
 'mes',
 'ano',
 'tavg',
 'tmin',
 'tmax',
 'prcp',
 'snow',
 'wspd',
 'wpgt',
 'pres',
 'tsun']

In [0]:
new_df= spark.createDataFrame([{'tipo':'laborable', 'dia_semana': 4, 'mes': 9, 'ano': 2025, 'tavg': 19.2, 'tmin': 16.2, 'tmax': 24, 'prcp':0, 'snow':0, 'wspd': 10.8, 'wpgt': 25.9, 'pres': 1015, 'tsun': 450}]) # mira, le paso un diccionario

In [0]:
pred = loaded_model.transform(new_df)

In [0]:
pred.display()

ano,dia_semana,mes,prcp,pres,snow,tavg,tipo,tmax,tmin,tsun,wpgt,wspd,tipoIndex,features,prediction
2025,4,9,0,1015,0,19.2,laborable,24,16.2,450,25.9,10.8,0.0,"Map(vectorType -> dense, length -> 13, values -> List(0.0, 4.0, 9.0, 2025.0, 19.2, 16.2, 24.0, 0.0, 0.0, 10.8, 25.9, 1015.0, 450.0))",2006.2427684473728


In [0]:
pred.select('prediction').display()

prediction
2006.2427684473728
