# 01 — End-to-End Pipeline with In-Pipeline Data Cleaning


This notebook trains a medical insurance charge predictor with a **leakage-safe Pipeline**:
- **Cleaning (in-pipeline)**: normalize categories, coerce numerics, domain-safe clamping, and imputation.
- **Feature Engineering**: `bmi^2`, `age×bmi`, `smoker×bmi`.
- **Preprocessing**: `StandardScaler` for numerics, `OneHotEncoder` for categoricals.
- **Model Zoo**: Linear / Ridge / ElasticNet / GBRT with small-grid CV.
- **Artifacts**: best model (`models/`), metrics JSON, subgroup audit JSON, and figures.


## 1) Imports & Paths

In [1]:

import os, json, math, pathlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, KFold, GridSearchCV, learning_curve
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import joblib

PROJ_DIR    = os.getcwd()
DATA_PATH   = os.path.join(PROJ_DIR, "data", "insurance.csv")
REPORTS_DIR = os.path.join(PROJ_DIR, "reports")
FIG_DIR     = os.path.join(REPORTS_DIR, "figures")
MODELS_DIR  = os.path.join(PROJ_DIR, "models")

for d in [REPORTS_DIR, FIG_DIR, MODELS_DIR]:
    pathlib.Path(d).mkdir(parents=True, exist_ok=True)

print("Working dir:", PROJ_DIR)


Working dir: /Users/cedar/Documents/Python_Job interview/insurance-pricing/insurance-pricing/notebooks


## 2) Load Data

In [2]:

# Expected columns: age, sex, bmi, children, smoker, region, charges
df = pd.read_csv(DATA_PATH)
print(df.shape)
df.head()


(1338, 7)


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


## 3) Schema & In-Pipeline Cleaning

In [3]:

CATEGORICAL = ["sex", "smoker", "region"]
NUMERIC     = ["age", "bmi", "children"]
TARGET      = "charges"

# ---- In-pipeline data cleaning (top-level, picklable, no lambdas) ----
def clean_raw(X):
    """Domain-safe cleaning: types, categories, basic range clamps & do-not-leak defaults.
    - Normalize string categories (lower/strip) and map common variants
    - Coerce numeric types; clip to safe domain ranges (no quantiles)
    - The following imputation happens later inside ColumnTransformer
    """
    import pandas as pd
    X = X.copy()

    expected_cat = ["sex", "smoker", "region"]
    expected_num = ["age", "bmi", "children"]
    for col in expected_cat + expected_num:
        if col not in X.columns:
            X[col] = pd.NA

    def norm_str(s):
        if pd.isna(s): return s
        return str(s).strip().lower()

    for c in expected_cat:
        X[c] = X[c].map(norm_str)

    smoker_map = {"yes":"yes","y":"yes","true":"yes","1":"yes",
                  "no":"no","n":"no","false":"no","0":"no"}
    X["smoker"] = X["smoker"].map(lambda v: smoker_map.get(v, v))

    sex_map = {"male":"male","m":"male","female":"female","f":"female"}
    X["sex"] = X["sex"].map(lambda v: sex_map.get(v, v))

    def fix_region(v):
        if v is None or pd.isna(v): return v
        v = str(v).strip().lower().replace("_"," ").replace("-"," ")
        return v
    X["region"] = X["region"].map(fix_region)

    for c in expected_num:
        X[c] = pd.to_numeric(X[c], errors="coerce")

    # Domain-safe clamps (no quantiles to avoid leakage)
    X["age"]      = X["age"].clip(lower=0, upper=120)
    X["children"] = X["children"].clip(lower=0).round()
    X["bmi"]      = X["bmi"].clip(lower=10, upper=80)
    return X


## 4) Feature Engineering & Preprocessing

In [4]:

NEW_NUMERIC = ["bmi2", "age_bmi", "smoker_bmi"]

def add_interactions(Xd):
    Xd = Xd.copy()
    Xd["bmi2"]       = Xd["bmi"] ** 2
    Xd["age_bmi"]    = Xd["age"] * Xd["bmi"]
    smoker_num       = (Xd["smoker"] == "yes").astype(int)
    Xd["smoker_bmi"] = Xd["bmi"] * smoker_num
    return Xd

cleaner  = FunctionTransformer(clean_raw, validate=False)
feat_eng = FunctionTransformer(add_interactions, validate=False)

all_numeric = NUMERIC + NEW_NUMERIC

num_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("scale",  StandardScaler()),
])
cat_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("ohe",    OneHotEncoder(handle_unknown="ignore")),
])

preprocess = ColumnTransformer([
    ("num", num_pipe, all_numeric),
    ("cat", cat_pipe, CATEGORICAL),
])


## 5) Train/Test Split

In [5]:

X = df[CATEGORICAL + NUMERIC].copy()
y = df[TARGET].copy()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

len(X_train), len(X_test)


(1070, 268)

## 6) Metrics Helper (RMSE-compatible with older scikit-learn)

In [6]:

def evaluate(y_true, y_pred):
    try:
        from sklearn.metrics import root_mean_squared_error
        rmse = float(root_mean_squared_error(y_true, y_pred))
    except Exception:
        rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
    return {"r2": float(r2_score(y_true, y_pred)),
            "rmse": rmse,
            "mae": float(mean_absolute_error(y_true, y_pred))}


## 7) Model Zoo + CV + Selection

In [7]:

def build_model(name: str):
    if name == "linear":
        model = LinearRegression(); grid = {}
    elif name == "ridge":
        model = Ridge(random_state=42); grid = {"model__alpha": [0.1, 1.0, 10.0, 50.0]}
    elif name == "elasticnet":
        model = ElasticNet(random_state=42, max_iter=5000)
        grid = {"model__alpha": [0.001, 0.01, 0.1, 1.0], "model__l1_ratio": [0.2, 0.5, 0.8]}
    elif name == "gbrt":
        model = GradientBoostingRegressor(random_state=42)
        grid = {"model__n_estimators": [200], "model__learning_rate": [0.05, 0.1], "model__max_depth": [2, 3], "model__subsample": [0.9, 1.0]}
    else:
        raise ValueError(name)

    pipe = Pipeline([
        ("clean", cleaner),
        ("feat",  feat_eng),
        ("pre",   preprocess),
        ("model", model),
    ])
    return pipe, grid

models = ["linear", "ridge", "elasticnet", "gbrt"]
cv = KFold(n_splits=3, shuffle=True, random_state=42)

all_metrics = {}
best = {"name": None, "rmse": float("inf"), "estimator": None, "best_params": {}}

for name in models:
    pipe, grid = build_model(name)
    if grid:
        gs = GridSearchCV(pipe, grid, cv=cv, n_jobs=1, error_score="raise")
        gs.fit(X_train, y_train)
        est = gs.best_estimator_; params = gs.best_params_
    else:
        pipe.fit(X_train, y_train)
        est = pipe; params = {}
    pred = est.predict(X_test)
    m = evaluate(y_test, pred)
    all_metrics[name] = {"metrics": m, "best_params": params}
    if m["rmse"] < best["rmse"]:
        best = {"name": name, "rmse": m["rmse"], "estimator": est, "best_params": params}

best, all_metrics


({'name': 'gbrt',
  'rmse': 4330.045291563213,
  'estimator': Pipeline(steps=[('clean',
                   FunctionTransformer(func=<function clean_raw at 0x164bc5a80>)),
                  ('feat',
                   FunctionTransformer(func=<function add_interactions at 0x164bc5c60>)),
                  ('pre',
                   ColumnTransformer(transformers=[('num',
                                                    Pipeline(steps=[('impute',
                                                                     SimpleImputer(strategy='median')),
                                                                    ('scale',
                                                                     StandardScaler())]),
                                                    ['age', 'bmi', 'children',
                                                     'bmi2', 'age_bmi',
                                                     'smoker_bmi']),
                                                   ('cat

## 8) Save Artifacts (Model + Metrics)

In [8]:

def to_serializable(obj):
    if isinstance(obj, (np.floating, np.integer)): return obj.item()
    if isinstance(obj, (np.ndarray,)):            return obj.tolist()
    if isinstance(obj, float) and (math.isnan(obj) or math.isinf(obj)): return None
    return obj
def map_json(o):
    if isinstance(o, dict): return {k: map_json(v) for k, v in o.items()}
    if isinstance(o, list): return [map_json(v) for v in o]
    return to_serializable(o)

model_path = os.path.join(MODELS_DIR, f"best_{best['name']}.pkl")
joblib.dump(best["estimator"], model_path)

payload = map_json({"per_model": all_metrics, "best_model": best["name"], "best_params": best["best_params"]})
with open(os.path.join(REPORTS_DIR, "metrics_enhanced.json"), "w", encoding="utf-8") as f:
    json.dump(payload, f, indent=2, ensure_ascii=False)

print("Saved model:", model_path)


Saved model: /Users/cedar/Documents/Python_Job interview/insurance-pricing/insurance-pricing/notebooks/models/best_gbrt.pkl


## 9) Subgroup Audit (Fairness/Robustness)

In [9]:

from sklearn.metrics import mean_squared_error, mean_absolute_error

def subgroup_errors(estimator, X_test, y_test, group):
    df_eval = X_test.copy()
    df_eval["__y_true__"] = y_test.values
    df_eval["__y_pred__"] = estimator.predict(X_test)
    res = {}
    for g, sub in df_eval.groupby(group):
        rmse = float(np.sqrt(mean_squared_error(sub["__y_true__"], sub["__y_pred__"])))
        mae  = float(mean_absolute_error(sub["__y_true__"], sub["__y_pred__"]))
        res[str(g)] = {"rmse": rmse, "mae": mae, "n": int(len(sub))}
    return res

subgroups = {
    "smoker": subgroup_errors(best["estimator"], X_test, y_test, "smoker"),
    "region": subgroup_errors(best["estimator"], X_test, y_test, "region"),
}
with open(os.path.join(REPORTS_DIR, "subgroup_metrics.json"), "w", encoding="utf-8") as f:
    json.dump(subgroups, f, indent=2)

subgroups


{'smoker': {'no': {'rmse': 4224.611472377178,
   'mae': 2441.9887118958195,
   'n': 214},
  'yes': {'rmse': 4724.794937798383, 'mae': 2484.613637390441, 'n': 54}},
 'region': {'northeast': {'rmse': 4129.655981094355,
   'mae': 2558.7697142384545,
   'n': 57},
  'northwest': {'rmse': 4962.271473757973, 'mae': 2421.712340899857, 'n': 69},
  'southeast': {'rmse': 3460.0686077585547, 'mae': 2310.030077949855, 'n': 81},
  'southwest': {'rmse': 4763.118881601477, 'mae': 2568.758347822441, 'n': 61}}}

## 10) Figures (Residuals & Learning Curve)

In [10]:

# Residuals vs Predicted
yhat = best["estimator"].predict(X_test)
resid = y_test - yhat
plt.figure(); plt.scatter(yhat, resid, alpha=0.6); plt.axhline(0, linestyle="--")
plt.xlabel("Predicted charges"); plt.ylabel("Residuals (y - ŷ)"); plt.title("Residuals vs Predicted (Hold-out)")
plt.savefig(os.path.join(FIG_DIR, "residuals_vs_pred.png"), bbox_inches="tight"); plt.close()

# Learning curve
train_sizes, train_scores, val_scores = learning_curve(
    best["estimator"], df[CATEGORICAL + NUMERIC], df[TARGET],
    cv=3, scoring="neg_mean_squared_error", train_sizes=np.linspace(0.1, 1.0, 8),
    n_jobs=1, shuffle=True, random_state=42
)
mean_train_rmse = np.sqrt(-train_scores).mean(axis=1)
mean_val_rmse   = np.sqrt(-val_scores).mean(axis=1)

plt.figure()
plt.plot(train_sizes, mean_train_rmse, marker="o", label="Train RMSE")
plt.plot(train_sizes, mean_val_rmse, marker="s", label="CV RMSE")
plt.xlabel("Training samples"); plt.ylabel("RMSE"); plt.legend()
plt.title("Learning Curve (Best Model)")
plt.savefig(os.path.join(FIG_DIR, "learning_curve.png"), bbox_inches="tight"); plt.close()

"Saved figures to reports/figures/"


'Saved figures to reports/figures/'

## 11) Mini Model Card (Text Summary)

In [11]:

bm = best["name"]
m  = all_metrics[bm]["metrics"]
base = all_metrics["linear"]["metrics"]
def pct_drop(b, n): 
    return 100.0 * (b - n) / b if b > 0 else 0.0

summary = {
    "best_model": bm,
    "holdout": {
        "r2": round(m["r2"], 4),
        "rmse": round(m["rmse"], 2),
        "mae": round(m["mae"], 2),
    },
    "vs_linear": {
        "rmse_reduction_pct": round(pct_drop(base["rmse"], m["rmse"]), 2),
        "mae_reduction_pct":  round(pct_drop(base["mae"],  m["mae"]),  2),
    }
}
summary


{'best_model': 'gbrt',
 'holdout': {'r2': 0.8792, 'rmse': 4330.05, 'mae': 2450.58},
 'vs_linear': {'rmse_reduction_pct': 4.5, 'mae_reduction_pct': 10.57}}