In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

# Rutas
CWD = Path.cwd().resolve()
PROJECT_ROOT = CWD.parent if CWD.name == "notebooks" else CWD
DATA_PROCESSED = PROJECT_ROOT / "data" / "processed"

# Cargar datasets limpios
train_clean = pd.read_csv(DATA_PROCESSED / "train_clean.csv", keep_default_na=False)
test_clean  = pd.read_csv(DATA_PROCESSED / "test_clean.csv", keep_default_na=False)

print("train_clean:", train_clean.shape, "| test_clean:", test_clean.shape)
assert "SalePrice" in train_clean.columns and "SalePrice" not in test_clean.columns

# Definir X / y (trabajaremos en log para la métrica)
y = train_clean["SalePrice"].astype(float)
y_log = np.log1p(y)
X = train_clean.drop(columns=["SalePrice"])

print("X columns:", X.shape[1], "| y_log:", y_log.shape)

# Métrica y CV (guardamos criterios)
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

CV = KFold(n_splits=5, shuffle=True, random_state=42)
def rmse_log(y_true_log, y_pred_log):
    return mean_squared_error(y_true_log, y_pred_log, squared=False)

print("[ok] métrica: RMSE en log (≈ RMSLE) | CV=5, shuffle, seed=42")


train_clean: (1458, 81) | test_clean: (1459, 80)
X columns: 80 | y_log: (1458,)
[ok] métrica: RMSE en log (≈ RMSLE) | CV=5, shuffle, seed=42


In [2]:
# Columnas numéricas (ya imputadas/transformadas en el EDA)
num_cols = X.select_dtypes(include=["number"]).columns.tolist()

# Candidatas ordinales textuales (presentes en este dataset)
# (Mantenemos OverallQual/OverallCond como numéricas: ya son enteros ordinales)
ord_text_candidates = ["ExterQual", "BsmtQual", "KitchenQual", "FireplaceQu", "GarageFinish"]
ord_text_cols = [c for c in ord_text_candidates if c in X.columns]

# Mapeos de orden (incluye 'None_' al final para coherencia con imputación)
ORD_MAP = {
    "ExterQual":   ["Ex", "Gd", "TA", "Fa", "Po", "None_"],
    "BsmtQual":    ["Ex", "Gd", "TA", "Fa", "Po", "None_"],
    "KitchenQual": ["Ex", "Gd", "TA", "Fa", "Po", "None_"],
    "FireplaceQu": ["Ex", "Gd", "TA", "Fa", "Po", "None_"],
    "GarageFinish":["Fin","RFn","Unf","None_"],
}
# Filtrar el mapping solo a columnas presentes
ORD_MAP = {c: [v for v in vals if (c in ord_text_cols)] for c, vals in ORD_MAP.items()}

# Nominales = objeto - ordinales_texto
obj_cols = X.select_dtypes(include=["object"]).columns.tolist()
cat_nom_cols = sorted(list(set(obj_cols) - set(ord_text_cols)))

# Chequeos
overlap1 = set(num_cols) & set(ord_text_cols)
overlap2 = set(ord_text_cols) & set(cat_nom_cols)
assert not overlap1 and not overlap2, "Solapamiento inesperado entre grupos"

print(f"#num: {len(num_cols)} | #ord_text: {len(ord_text_cols)} | #nom: {len(cat_nom_cols)}")
print("ord_text_cols:", ord_text_cols)
print("nominal sample:", cat_nom_cols[:10])


#num: 37 | #ord_text: 5 | #nom: 38
ord_text_cols: ['ExterQual', 'BsmtQual', 'KitchenQual', 'FireplaceQu', 'GarageFinish']
nominal sample: ['Alley', 'BldgType', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'CentralAir', 'Condition1', 'Condition2', 'Electrical']


In [3]:
# --- Preprocesador: ColumnTransformer (ordinal + one-hot + num) ---
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from inspect import signature

# Categorías en el orden correcto para cada columna ordinal (mismos índices que ord_text_cols)
ord_categories = []
for col in ord_text_cols:
    ord_categories.append(ORD_MAP[col])

# OneHotEncoder compatible con cualquier versión de sklearn
if "sparse_output" in signature(OneHotEncoder).parameters:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
else:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)

# Escalado opcional de numéricas (recomendado para lineales)
SCALE_NUM = True
num_transform = StandardScaler() if SCALE_NUM else "passthrough"

preprocess = ColumnTransformer(
    transformers=[
        ("ord", OrdinalEncoder(
            categories=ord_categories,
            handle_unknown="use_encoded_value",
            unknown_value=-1
        ), ord_text_cols),
        ("nom", ohe, cat_nom_cols),
        ("num", num_transform, num_cols),
    ],
    remainder="drop",
    n_jobs=None,
)

print("[ok] preprocess listo")


[ok] preprocess listo


In [4]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd

pipe_lr = Pipeline(steps=[
    ("prep", preprocess),
    ("model", LinearRegression())
])

scores = cross_val_score(pipe_lr, X, y_log,
                         scoring="neg_root_mean_squared_error",
                         cv=CV, n_jobs=None)
rmse_mean = -scores.mean()
rmse_std  =  scores.std()
print(f"[Linear] RMSE_log CV5: {rmse_mean:.4f} ± {rmse_std:.4f}")


[Linear] RMSE_log CV5: 0.1350 ± 0.0186


In [5]:
# === Snippet C — Baselines 2–4: RidgeCV / LassoCV / ElasticNetCV ===
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd

results = []

# 1) RidgeCV (selección de alpha interna por GCV si cv=None)
ridge_alphas = np.logspace(-3, 3, 13)
pipe_ridge = Pipeline([
    ("prep", preprocess),
    ("model", RidgeCV(alphas=ridge_alphas))  # <— sin store_cv_values
])
scores = cross_val_score(pipe_ridge, X, y_log, scoring="neg_root_mean_squared_error", cv=CV)
results.append(("RidgeCV", -scores.mean(), scores.std()))

# 2) LassoCV (CV interna para elegir alpha)
lasso_alphas = np.logspace(-4, 1, 12)
pipe_lasso = Pipeline([
    ("prep", preprocess),
    ("model", LassoCV(alphas=lasso_alphas, max_iter=10000, n_jobs=None))
])
scores = cross_val_score(pipe_lasso, X, y_log, scoring="neg_root_mean_squared_error", cv=CV)
results.append(("LassoCV", -scores.mean(), scores.std()))

# 3) ElasticNetCV (CV interna para alpha y l1_ratio)
l1_grid = [0.15, 0.3, 0.5, 0.7, 0.9]
en_alphas = np.logspace(-4, 1, 12)
pipe_en = Pipeline([
    ("prep", preprocess),
    ("model", ElasticNetCV(alphas=en_alphas, l1_ratio=l1_grid, max_iter=10000, n_jobs=None))
])
scores = cross_val_score(pipe_en, X, y_log, scoring="neg_root_mean_squared_error", cv=CV)
results.append(("ElasticNetCV", -scores.mean(), scores.std()))

res_df = pd.DataFrame(results, columns=["modelo", "rmse_log_mean", "rmse_log_std"]).sort_values("rmse_log_mean")
display(res_df)


Unnamed: 0,modelo,rmse_log_mean,rmse_log_std
2,ElasticNetCV,0.112072,0.006943
1,LassoCV,0.112739,0.007374
0,RidgeCV,0.113146,0.007696


In [6]:
# Incluir el Linear en la tabla para ver todo junto
all_rows = [("LinearRegression", rmse_mean, rmse_std)]
all_rows += results
summary = pd.DataFrame(all_rows, columns=["modelo", "rmse_log_mean", "rmse_log_std"]).sort_values("rmse_log_mean")
print("== Resumen CV (RMSE en log) ==")
display(summary)

best_name = summary.iloc[0]["modelo"]
print(f"[mejor provisional] {best_name}")


== Resumen CV (RMSE en log) ==


Unnamed: 0,modelo,rmse_log_mean,rmse_log_std
3,ElasticNetCV,0.112072,0.006943
2,LassoCV,0.112739,0.007374
1,RidgeCV,0.113146,0.007696
0,LinearRegression,0.135003,0.018569


[mejor provisional] ElasticNetCV


In [7]:
# Re-evaluamos 4 baselines y elegimos el mejor por RMSE_log (más chico = mejor)
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
import numpy as np, pandas as pd

candidates = {
    "LinearRegression": LinearRegression(),
    "RidgeCV": RidgeCV(alphas=np.logspace(-3, 3, 13)),
    "LassoCV": LassoCV(alphas=np.logspace(-4, 1, 12), max_iter=10000, n_jobs=None),
    "ElasticNetCV": ElasticNetCV(alphas=np.logspace(-4, 1, 12), l1_ratio=[0.15,0.3,0.5,0.7,0.9], max_iter=10000, n_jobs=None),
}

rows = []
for name, est in candidates.items():
    pipe = Pipeline([("prep", preprocess), ("model", est)])
    scores = cross_val_score(pipe, X, y_log, scoring="neg_root_mean_squared_error", cv=CV)
    rows.append((name, -scores.mean(), scores.std()))
res_df = pd.DataFrame(rows, columns=["modelo","rmse_log_mean","rmse_log_std"]).sort_values("rmse_log_mean")
display(res_df)

best_name = res_df.iloc[0]["modelo"]
print(f"[mejor] {best_name}")
best_est = candidates[best_name]


Unnamed: 0,modelo,rmse_log_mean,rmse_log_std
3,ElasticNetCV,0.112072,0.006943
2,LassoCV,0.112739,0.007374
1,RidgeCV,0.113146,0.007696
0,LinearRegression,0.135003,0.018569


[mejor] ElasticNetCV


In [8]:
# Fit full y predicción
from sklearn.pipeline import Pipeline
import numpy as np

best_pipe = Pipeline([("prep", preprocess), ("model", best_est)])
best_pipe.fit(X, y_log)  # entrenamos en TODO el train (target en log)

# Predicciones en log y vuelta a escala original
X_test = test_clean.copy()
y_pred_log = best_pipe.predict(X_test)
y_pred = np.expm1(y_pred_log)

# Seguridad: sin negativos ni NaN
import numpy as np
y_pred = np.clip(y_pred, a_min=1.0, a_max=None)
assert not np.isnan(y_pred).any(), "Hay NaN en predicciones"

print("[ok] modelo entrenado y predicciones listas:", y_pred.shape)


[ok] modelo entrenado y predicciones listas: (1459,)


In [9]:
from pathlib import Path
import pandas as pd
from datetime import datetime

# Carpeta de salidas
CWD = Path.cwd().resolve()
PROJECT_ROOT = CWD.parent if CWD.name == "notebooks" else CWD
SUB_DIR = PROJECT_ROOT / "submissions"
SUB_DIR.mkdir(parents=True, exist_ok=True)

# DataFrame de envío (Id + SalePrice)
assert "Id" in test_clean.columns, "test_clean debe contener la columna Id"
submission = pd.DataFrame({"Id": test_clean["Id"].values, "SalePrice": y_pred})

# Guardar con nombre fijo y con timestamp
stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
sub_fixed = SUB_DIR / "submission_baseline.csv"
sub_time  = SUB_DIR / f"submission_baseline_{best_name}_{stamp}.csv"
submission.to_csv(sub_fixed, index=False)
submission.to_csv(sub_time, index=False)

# Verificación rápida
print(f"[save] {sub_fixed}")
print(f"[save] {sub_time}")
print("[check] filas:", submission.shape[0], "| cols:", submission.shape[1])
display(submission.head(5))
print("[range] SalePrice min/max:", float(submission.SalePrice.min()), float(submission.SalePrice.max()))


[save] /mnt/d/Data/0_Ordenado/LM/IA/fast-ia/proyectos/house-prices/submissions/submission_baseline.csv
[save] /mnt/d/Data/0_Ordenado/LM/IA/fast-ia/proyectos/house-prices/submissions/submission_baseline_ElasticNetCV_20250910_113817.csv
[check] filas: 1459 | cols: 2


Unnamed: 0,Id,SalePrice
0,1461,118661.485193
1,1462,156405.528362
2,1463,178951.13683
3,1464,199153.389795
4,1465,197608.053917


[range] SalePrice min/max: 44542.192152338226 959914.0581070407


In [10]:
from pathlib import Path
import joblib
import numpy as np

# Rutas
CWD = Path.cwd().resolve()
PROJECT_ROOT = CWD.parent if CWD.name == "notebooks" else CWD
MODELS_DIR = PROJECT_ROOT / "models"
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Guardado
model_path = MODELS_DIR / f"baseline_{best_name}.joblib"
joblib.dump(best_pipe, model_path)
print(f"[save] {model_path}")

# Smoke test de carga y predicción (log-space)
pipe_loaded = joblib.load(model_path)
mini_pred_log = pipe_loaded.predict(test_clean.head(3))
print("[check] pred.shape:", mini_pred_log.shape)
print("[check] sample pred (log):", mini_pred_log[:3])


[save] /mnt/d/Data/0_Ordenado/LM/IA/fast-ia/proyectos/house-prices/models/baseline_ElasticNetCV.joblib
[check] pred.shape: (3,)
[check] sample pred (log): [11.68403848 11.96021385 12.09487366]


In [11]:
import numpy as np
import pandas as pd

pre = best_pipe.named_steps["prep"]

# Intento directo (sklearn >= 1.0)
try:
    feat_names = pre.get_feature_names_out()
except Exception:
    # Fallback si algún transformador no implementa get_feature_names_out
    feat_names = []
    for name, trans, cols in pre.transformers_:
        if name == "remainder":
            continue
        if hasattr(trans, "get_feature_names_out"):
            try:
                fn = list(trans.get_feature_names_out(cols))
            except Exception:
                fn = [f"{name}__{c}" for c in cols]
        else:
            fn = [f"{name}__{c}" for c in cols]
        feat_names.extend(fn)
    feat_names = np.array(feat_names, dtype=object)

FEATURES_CSV = MODELS_DIR / f"feature_names_{best_name}.csv"
pd.Series(feat_names, name="feature").to_csv(FEATURES_CSV, index=False)
print(f"[save] {FEATURES_CSV} | n_features={len(feat_names)}")


[save] /mnt/d/Data/0_Ordenado/LM/IA/fast-ia/proyectos/house-prices/models/feature_names_ElasticNetCV.csv | n_features=284


In [12]:
# === PASO 17.3 (FIX) — Guardar metadatos del experimento (JSON serializable) ===
import json, platform, sys, sklearn, numpy as np, pandas as pd
from datetime import datetime
from pathlib import Path

META_JSON = MODELS_DIR / f"metadata_{best_name}.json"

# Conversor robusto: NumPy/Pandas -> tipos JSON
def to_jsonable(o):
    # NumPy escalares
    if isinstance(o, (np.integer,)):
        return int(o)
    if isinstance(o, (np.floating,)):
        return float(o)
    if isinstance(o, (np.bool_,)):
        return bool(o)
    # NumPy arrays
    if isinstance(o, (np.ndarray,)):
        return o.tolist()
    # Pandas
    if isinstance(o, (pd.Series, pd.Index)):
        return o.tolist()
    # Sets u otros
    if isinstance(o, set):
        return list(o)
    # Último recurso: string (para objetos no serializables)
    return str(o)

# Resultados CV (si existe la tabla)
try:
    cv_rows = res_df.to_dict(orient="records")
except Exception:
    cv_rows = []

# Hiperparámetros del modelo (puede incluir arrays -> se serializan con default=)
try:
    model_params = best_pipe.named_steps["model"].get_params()
except Exception:
    model_params = {}

# Listas de columnas (si están en memoria)
safe = lambda v: v if isinstance(v, (list, dict)) else None
meta_cols = {
    "num_cols": safe(locals().get("num_cols")),
    "ord_text_cols": safe(locals().get("ord_text_cols")),
    "cat_nom_cols": safe(locals().get("cat_nom_cols")),
    "ORD_MAP": safe(locals().get("ORD_MAP")),
}

metadata = {
    "timestamp": datetime.now().isoformat(timespec="seconds"),
    "system": {
        "python": sys.version.split()[0],
        "platform": platform.platform(),
        "sklearn": sklearn.__version__,
        "numpy": np.__version__,
        "pandas": pd.__version__,
    },
    "cv": {
        "strategy": "KFold",
        "n_splits": 5,
        "shuffle": True,
        "random_state": 42,
        "results": cv_rows,  # media/std por modelo
    },
    "data": {
        "train_shape": list(X.shape) if "X" in locals() else None,
        "test_shape": list(test_clean.shape) if "test_clean" in locals() else None,
        "target": "SalePrice (entrenado en log1p)",
    },
    "pipeline": {
        "best_name": str(best_name),
        "model_class": best_pipe.named_steps["model"].__class__.__name__,
        "model_params": model_params,         # <- puede tener arrays (se manejan con to_jsonable)
        "preprocess_type": best_pipe.named_steps["prep"].__class__.__name__,
        "columns": meta_cols,
    },
}

with open(META_JSON, "w", encoding="utf-8") as f:
    json.dump(metadata, f, ensure_ascii=False, indent=2, default=to_jsonable)

print(f"[save] {META_JSON}")


[save] /mnt/d/Data/0_Ordenado/LM/IA/fast-ia/proyectos/house-prices/models/metadata_ElasticNetCV.json


In [13]:
from datetime import datetime
from pathlib import Path

CARD = PROJECT_ROOT / "reports" / f"model_card_{best_name}.md"
CARD.parent.mkdir(parents=True, exist_ok=True)

card = f"""# Model Card — {best_name}
Fecha: {datetime.now().strftime('%Y-%m-%d %H:%M')}

## Objetivo
Predecir SalePrice (House Prices — Kaggle). Entrenado en log1p(SalePrice).

## Datos
- Train limpio: data/processed/train_clean.csv (sin nulos)
- Test limpio : data/processed/test_clean.csv (sin nulos)
- Outliers removidos en train (2 casos en GrLivArea)

## Preprocesamiento
- OrdinalEncoder en: {locals().get('ord_text_cols', [])}
- OneHotEncoder en nominales; StandardScaler en numéricas
- Log1p en features sesgadas (según reports/skewed_feats.txt)
- MSSubClass como categórica (string)

## Validación y métrica
- CV=5 (shuffle=True, seed=42)
- Métrica: RMSE en log (≈ RMSLE)
- Resultados CV (resumen): {locals().get('res_df').to_dict(orient='records') if 'res_df' in locals() else 'N/D'}

## Modelo
- {best_name} ({best_pipe.named_steps['model'].__class__.__name__})
- Parámetros: ver models/metadata_{best_name}.json

## Limitaciones y supuestos
- Distribución y relaciones condicionadas por el dataset original
- Transformaciones log1p pueden afectar interpretabilidad directa
- One-Hot puede generar alta dimensionalidad

## Uso previsto
- Benchmark / baseline reproducible para la competencia
- Base para mejorar con regularización/end-to-end/boosting
"""
CARD.write_text(card, encoding="utf-8")
print(f"[save] {CARD}")


[save] /mnt/d/Data/0_Ordenado/LM/IA/fast-ia/proyectos/house-prices/reports/model_card_ElasticNetCV.md


In [14]:
import joblib
import numpy as np

pipe_re = joblib.load(MODELS_DIR / f"baseline_{best_name}.joblib")
pred_log_re = pipe_re.predict(test_clean)
assert pred_log_re.shape[0] == test_clean.shape[0]
pred_re = np.expm1(pred_log_re)
print("[OK] reload + predict:", pred_re.shape, "| min/max:", float(pred_re.min()), float(pred_re.max()))


[OK] reload + predict: (1459,) | min/max: 44542.192152338226 959914.0581070407
