# In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import RidgeCV
from sklearn.metrics import r2_score, mean_squared_error

# End-to-end regression experiment: predict the "Billing Amount" column from
# every other column using scaled numeric features, one-hot encoded object
# columns, a degree-2 polynomial expansion, and a cross-validated Ridge model.

# 1) Load the dataset
DATA_URL = "https://github.com/ulewis/Ejemplos/raw/main/Datos/healthcare_dataset.csv"
df = pd.read_csv(DATA_URL)

# 2) Separate the regression target from the feature matrix
TARGET = "Billing Amount"
y = df[TARGET]
X = df.drop(columns=[TARGET], errors="ignore")

# 3) Group columns by dtype: object columns are handled as categorical,
#    every other dtype is handled as numeric
cat_cols = X.select_dtypes(include=["object"]).columns
num_cols = X.select_dtypes(exclude=["object"]).columns

# 4) Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 5) Preprocessing: standardize numeric columns, one-hot encode categorical
#    ones; categories unseen during fit are ignored at transform time
scale_numeric = ("num", StandardScaler(), num_cols)
encode_categorical = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
preprocessor = ColumnTransformer(transformers=[scale_numeric, encode_categorical])

# 6) Full pipeline: preprocessing -> degree-2 polynomial features -> RidgeCV
#    over 50 log-spaced alphas in [1e-3, 1e3] with 10-fold cross-validation.
#    NOTE(review): the polynomial step expands the whole preprocessed matrix,
#    one-hot columns included, so the feature count grows quadratically with
#    the number of encoded categories -- confirm this is intended.
alphas = np.logspace(-3, 3, num=50)
poly_step = PolynomialFeatures(degree=2, include_bias=False)
ridge_step = RidgeCV(alphas=alphas, cv=10)
model = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("poly", poly_step),
        ("ridge", ridge_step),
    ]
)

# 7) Fit on the training split only
model.fit(X_train, y_train)

# 8) Score on the held-out split
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
best_alpha = model.named_steps["ridge"].alpha_

print(f"R^2 test:  {r2:.5f}")
print(f"RMSE test: {rmse:,.2f}")
print(f"Mejor alpha (RidgeCV): {best_alpha:.5f}")