In [0]:
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from mlflow.models.signature import infer_signature

# Train a LinearRegression baseline that predicts `purchases` from the feature
# columns below, and record parameters, metrics and the fitted model in MLflow.

# Single source of truth for the experiment settings: the same values drive
# the train/test split AND are logged to MLflow, so the tracked parameters
# cannot silently drift from the ones actually used.
TEST_SIZE = 0.2
RANDOM_STATE = 42

features = ["views", "revenue"]
target = "purchases"

# 1) Data (note: toPandas() only if the result fits in memory; this is an
#    aggregated gold table, so it is usually fine). The projection is derived
#    from `features` + `target` so the column list is declared only once.
pdf = spark.table("ecommerce.gold.products").select(*features, target).toPandas()

X = pdf[features]
y = pdf[target]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

with mlflow.start_run(run_name="linreg_purchases_v1"):
    model = LinearRegression()

    # Log every run parameter in one batch; model_type is read from the
    # estimator itself so it always matches the object that gets fitted.
    mlflow.log_params(
        {
            "model_type": type(model).__name__,
            "test_size": TEST_SIZE,
            "random_state": RANDOM_STATE,
            "features": ",".join(features),
            "n_rows": len(pdf),
        }
    )

    model.fit(X_train, y_train)

    # Metrics are computed on the held-out split only.
    preds = model.predict(X_test)

    r2 = r2_score(y_test, preds)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    mae = mean_absolute_error(y_test, preds)

    mlflow.log_metrics({"r2": r2, "rmse": rmse, "mae": mae})

    # 2) input_example + signature (removes the MLflow warning and documents
    #    the model's expected inputs/outputs).
    # NOTE(review): if any feature column is an integer dtype, MLflow may still
    # warn about integer columns in the inferred schema - confirm the dtypes.
    input_example = X_test.head(5)
    signature = infer_signature(X_test, preds)

    mlflow.sklearn.log_model(
        model,
        artifact_path="model",
        input_example=input_example,
        signature=signature,
    )

print(f"R2: {r2:.4f} | RMSE: {rmse:.4f} | MAE: {mae:.4f}")




R2: 0.7672 | RMSE: 73.8645 | MAE: 4.9868
