In [0]:
import numpy as np
import pandas as pd
import mlflow
import mlflow.sklearn

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Train three regressors that predict `purchases` from product-level features
# and record each one as a separate MLflow run (params, metrics, model).
#
# Every setting that is both *used* and *logged* is defined exactly once here,
# so the metadata stored in MLflow cannot drift from what was actually run.
FEATURE_COLUMNS = ["views", "revenue"]
TARGET_COLUMN = "purchases"
TEST_SIZE = 0.2
RANDOM_STATE = 42

# 1) Load data from Unity Catalog (Gold).
# toPandas() collects the selected columns onto the driver as a pandas
# DataFrame, so only the columns that are needed are selected first.
pdf = (
    spark.table("ecommerce.gold.products")
    .select(*FEATURE_COLUMNS, TARGET_COLUMN)
    .toPandas()
)

# 2) Basic cleaning: scikit-learn estimators reject NaN, so drop incomplete rows.
pdf = pdf.dropna(subset=[*FEATURE_COLUMNS, TARGET_COLUMN])
if pdf.empty:
    # Fail early with a clear message instead of the less obvious error
    # train_test_split would raise on an empty frame.
    raise ValueError(
        "No rows left in ecommerce.gold.products after dropping missing values"
    )

# NOTE(review): `revenue` is presumably derived from `purchases`; confirm that
# using it as a feature is not target leakage.
X = pdf[FEATURE_COLUMNS]
y = pdf[TARGET_COLUMN]

# 3) Hold out a fixed, reproducible test split shared by all models.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# 4) Candidate models, keyed by the name used for the MLflow run.
models = {
    "linear": LinearRegression(),
    "decision_tree": DecisionTreeRegressor(max_depth=5, random_state=RANDOM_STATE),
    "random_forest": RandomForestRegressor(
        n_estimators=200, random_state=RANDOM_STATE, n_jobs=-1
    ),
}

mlflow.set_experiment("/Users/pedelvicl@gmail.com/mlflow_day13")

# 5) One MLflow run per model: log params, fit, evaluate, log metrics + model.
for name, model in models.items():
    with mlflow.start_run(run_name=f"{name}_model"):
        # Parameters that describe the experiment setup.
        mlflow.log_param("model_type", name)
        mlflow.log_param("features", ",".join(FEATURE_COLUMNS))
        mlflow.log_param("test_size", TEST_SIZE)
        mlflow.log_param("random_state", RANDOM_STATE)

        # Log the hyperparameters the estimator actually exposes
        # (LinearRegression has neither; the forest has both).
        for hyperparameter in ("max_depth", "n_estimators"):
            if hasattr(model, hyperparameter):
                mlflow.log_param(hyperparameter, getattr(model, hyperparameter))

        # Train
        model.fit(X_train, y_train)

        # Predict on the held-out split and compute metrics.
        preds = model.predict(X_test)
        r2 = r2_score(y_test, preds)
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        mae = mean_absolute_error(y_test, preds)

        # Cast to plain float: the metric functions return NumPy scalars.
        mlflow.log_metric("r2", float(r2))
        mlflow.log_metric("rmse", float(rmse))
        mlflow.log_metric("mae", float(mae))

        # Log the model with an input example so MLflow can infer a signature
        # (and does not warn about a missing one).
        input_example = X_train.head(5)
        mlflow.sklearn.log_model(model, artifact_path="model", input_example=input_example)

        print(f"{name}: R2={r2:.4f} | RMSE={rmse:.4f} | MAE={mae:.4f}")


2026/01/22 22:08:42 INFO mlflow.tracking.fluent: Experiment with name '/Users/pedelvicl@gmail.com/mlflow_day13' does not exist. Creating a new experiment.


linear: R2=0.7672 | RMSE=73.8645 | MAE=4.9868




decision_tree: R2=0.7786 | RMSE=72.0230 | MAE=2.4405




random_forest: R2=0.7043 | RMSE=83.2356 | MAE=1.9592
