
# 🧱 Sklearn Feature Engineering Pipeline + Grid Search (Ames Housing)

This notebook builds a full **feature-engineering pipeline** for the **Ames Housing** dataset using **scikit-learn**.
It includes:

- Custom outlier cleaning that **calls the provided** `clean_outliers(df_in, method="cap", k=1.5)`
- Missing-value handling for **numeric** and **categorical** columns
- Encoding and optional scaling
- **`Pipeline` + `ColumnTransformer`** integration
- **`GridSearchCV`** over the outlier-cleaning method, the categorical imputation strategy, and the model hyperparameters
- Cross-validated score plus a holdout test report with RMSE & $R^2$

> **Note on `method="remove"`**: A `"remove"` strategy for `clean_outliers` works by **dropping the rows** that contain outliers. 
> Standard scikit-learn `Pipeline` objects expect transformers to **preserve the number of samples** (so that `y` stays aligned).
> To keep everything pipeline-safe, our wrapper only uses **`cap`** and **`median`** during grid search.  
> If you request `"remove"`, we **safely map** it to `"median"` internally and print a warning.


In [1]:

# --- Imports ---
import warnings
from typing import Optional

import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.compose import make_column_selector as selector

RANDOM_STATE = 42


In [2]:

# --- Load the CSV exactly as requested ---
# The path is relative, so the file is resolved against the kernel's current
# working directory.
csv_path = "Ames_Housing_Data.csv"
df       = pd.read_csv(csv_path)

# Preview the first 20 rows of the raw frame.
df.head(20)

Unnamed: 0,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,AllPub,...,0,,,,0,5,2010,WD,Normal,215000
1,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,244000
4,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,,0,3,2010,WD,Normal,189900
5,527105030,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,,,,0,6,2010,WD,Normal,195500
6,527127150,120,RL,41.0,4920,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,213500
7,527145080,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,0,,,,0,1,2010,WD,Normal,191500
8,527146030,120,RL,39.0,5389,Pave,,IR1,Lvl,AllPub,...,0,,,,0,3,2010,WD,Normal,236500
9,527162130,60,RL,60.0,7500,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2010,WD,Normal,189000


In [3]:
# Name of the column the model will predict.
TARGET = "SalePrice"

# Inventory the column names by dtype. TARGET is itself numeric, so it is
# listed in `numeric_columns` as well; in this cell the lists are only printed.
numeric_columns = list(df.select_dtypes(include=["number"]).columns)
text_columns = list(df.select_dtypes(include=["object"]).columns)

print(f"Numeric columns: {numeric_columns}")
print(f"Text columns: {text_columns}")

Numeric columns: ['PID', 'MS SubClass', 'Lot Frontage', 'Lot Area', 'Overall Qual', 'Overall Cond', 'Year Built', 'Year Remod/Add', 'Mas Vnr Area', 'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr', 'TotRms AbvGrd', 'Fireplaces', 'Garage Yr Blt', 'Garage Cars', 'Garage Area', 'Wood Deck SF', 'Open Porch SF', 'Enclosed Porch', '3Ssn Porch', 'Screen Porch', 'Pool Area', 'Misc Val', 'Mo Sold', 'Yr Sold', 'SalePrice']
Text columns: ['MS Zoning', 'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin Type 2', 'Heating', 'Heating QC',

In [4]:
# Pull the target out first, then keep every other column as a feature.
y = df[TARGET]
X = df.drop(columns=TARGET)

# Hold out 20% of the rows for the final test; the fixed seed makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=RANDOM_STATE,
    test_size=0.2,
)

X_train.head()


Unnamed: 0,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition
381,527359050,20,RL,80.0,10400,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,,0,6,2009,WD,Family
834,906475070,60,RL,,28698,Pave,,IR2,Low,AllPub,...,225,0,,,,0,6,2009,WD,Abnorml
1898,534429030,90,RL,70.0,9842,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,3,2007,WD,Normal
678,535451170,90,RL,60.0,7200,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2009,WD,Normal
700,902109130,190,RM,63.0,7627,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,10,2009,WD,Normal


In [5]:
# Preview the first rows of the held-out test features.
X_test.head()

Unnamed: 0,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition
1357,903427090,70,RM,,5100,Pave,Grvl,Reg,Lvl,AllPub,...,0,0,,MnPrv,,0,6,2008,WD,Normal
2367,527450460,160,RM,21.0,1890,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,7,2006,WD,Normal
2822,908128100,60,RL,62.0,7162,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,5,2006,WD,Normal
2126,907135180,20,RL,60.0,8070,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,8,2007,WD,Normal
1544,910200080,30,RM,50.0,7000,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,,0,7,2008,WD,Normal


## Provided function: `clean_outliers`

In [6]:
import numpy as np
import pandas as pd
from pandas.api.types import is_integer_dtype

def clean_outliers(df_in: pd.DataFrame, method: str = "cap", k: float = 1.5,
                   int_policy: str = "round"):
    """
    Treat IQR outliers in every numeric column and return a new DataFrame.

    For each numeric column the fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR``,
    computed from the data passed in. Values strictly outside the fences are
    outliers; NaNs are never treated as outliers. Non-numeric columns are
    returned untouched, and ``df_in`` itself is never modified.

    Parameters
    ----------
    df_in : pd.DataFrame
        Input frame.
    method : str
        "cap"    -- clip outliers to the nearest fence (winsorize).
        "median" -- replace outliers with the column median.
        "remove" -- drop every row that holds an outlier in any numeric column
                    (the result can have fewer rows than the input).
    k : float
        Fence width as a multiple of the IQR.
    int_policy : str
        How to convert floats back to int for originally-integer columns:
        "round" | "floor" | "ceil" | "keep_float". Any other value behaves
        like "round".

    Returns
    -------
    pd.DataFrame
        Cleaned copy of ``df_in``.

    Raises
    ------
    ValueError
        If ``method`` is not one of "cap", "median", "remove".
    """
    valid_methods = ("cap", "median", "remove")
    if method not in valid_methods:
        # Previously an unknown method silently returned the data unchanged.
        raise ValueError(
            f"Unsupported method: {method!r}. Use one of {valid_methods}."
        )

    df = df_in.copy()

    # Positional keep-mask for "remove". A plain boolean array (rather than a
    # Series) avoids index alignment, so duplicate index labels are harmless.
    keep_rows = np.ones(len(df), dtype=bool)

    for col in df.select_dtypes(include="number").columns:
        s = df[col]

        # Do math in float to avoid dtype-mismatch during assignment
        work = s.astype("float64")

        q1, q3 = work.quantile([0.25, 0.75])
        iqr = q3 - q1
        low = q1 - k * iqr
        high = q3 + k * iqr
        # Comparisons against NaN are False, so missing values are not flagged.
        is_outlier = (work < low) | (work > high)

        if method == "remove":
            # Only record which rows to drop; the column values stay as they are.
            keep_rows &= ~is_outlier.to_numpy()
            continue

        if method == "cap":
            work = work.clip(lower=low, upper=high)
        else:  # "median"
            work = work.mask(is_outlier, work.median())

        # If original column was integer, convert back as requested
        if is_integer_dtype(s.dtype):
            if int_policy == "keep_float":
                df[col] = work
            elif int_policy == "floor":
                df[col] = np.floor(work).astype(s.dtype)
            elif int_policy == "ceil":
                df[col] = np.ceil(work).astype(s.dtype)
            else:  # "round"
                df[col] = np.round(work).astype(s.dtype)
        else:
            df[col] = work

    if method == "remove":
        df = df.loc[keep_rows]

    return df



## Pipeline-safe wrapper: `OutlierCleaner`

This wrapper **calls `clean_outliers`** but **never changes** the number of rows so the pipeline stays valid.
- Supports `method in {"cap", "median"}` directly.
- If `method == "remove"`, it **falls back to `"median"` and warns** (to keep sample count fixed).


In [7]:

class OutlierCleaner(BaseEstimator, TransformerMixin):
    """
    Row-preserving scikit-learn wrapper around `clean_outliers`.

    Parameters
    ----------
    method : "cap", "median" or "remove". "remove" is never forwarded: it is
        replaced by "median" (with a warning) so the number of samples cannot
        change inside a Pipeline.
    k : IQR multiplier forwarded to `clean_outliers`.
    numeric_only : when True, only the numeric columns are sent to
        `clean_outliers` and then written back into a copy of the input;
        when False, the whole frame is handed over.

    NOTE(review): `fit` stores nothing, so whatever statistics `clean_outliers`
    derives are computed from the data given to `transform` each time, not
    from the training data.
    """

    def __init__(self, method: str = "cap", k: float = 1.5, numeric_only: bool = True):
        self.method = method
        self.k = k
        self.numeric_only = numeric_only

    def fit(self, X, y=None):
        """Stateless: there is nothing to learn, so just return self."""
        return self

    def transform(self, X):
        """Return an outlier-cleaned DataFrame with the same rows as `X`."""
        # Anything that is not already a DataFrame is wrapped in one, so the
        # steps downstream (e.g. ColumnTransformer) always receive a frame.
        frame = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)

        requested = self.method
        if requested not in {"cap", "median", "remove"}:
            raise ValueError(f"Unsupported method: {requested}. Use 'cap', 'median', or 'remove'.")

        # 'remove' would change n_samples, so it is swapped for 'median'.
        if requested == "remove":
            warnings.warn("OutlierCleaner: 'remove' would drop samples; mapping to 'median' for pipeline safety.")
            effective = "median"
        else:
            effective = requested

        if not self.numeric_only:
            return clean_outliers(frame, method=effective, k=self.k)

        # Clean only the numeric slice, then splice it back into a copy so the
        # non-numeric columns and the caller's frame stay untouched.
        numeric_names = frame.select_dtypes(include="number").columns
        cleaned_numeric = clean_outliers(frame[numeric_names], method=effective, k=self.k)
        result = frame.copy()
        result[numeric_names] = cleaned_numeric[numeric_names]
        return result


## Load data & define columns


## Preprocessing blocks

- **OutlierCleaner** (custom) → numeric columns only
- **Numeric pipeline** → imputer (`mean`/`median`) + optional scaler
- **Categorical pipeline** → imputer (`most_frequent`/`constant`) + one-hot encoding
- Combined via **ColumnTransformer**


In [8]:
# Numeric branch: fill gaps with the column median, then standardize.
# These are fixed settings unless a search grid overrides them by parameter name.
numeric_pre = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler(with_std=True, with_mean=True)),
])

# Categorical branch: fill gaps, then one-hot encode into a dense array.
# `fill_value` is supplied up front so the imputer can be switched to
# strategy="constant" without further configuration.
categorical_pre = Pipeline([
    ("imputer", SimpleImputer(fill_value="missing", strategy="most_frequent")),
    ("ohe", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])


In [9]:

# Route columns by dtype: numeric columns go through `numeric_pre`, object
# columns through `categorical_pre`; anything matching neither is dropped.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_pre, selector(dtype_include=np.number)),
        ("cat", categorical_pre, selector(dtype_include=object)),
    ],
    verbose_feature_names_out=True,
    remainder="drop",
)


In [10]:


# End-to-end estimator, applied in order:
#   1. "outliers"   - OutlierCleaner on the raw feature frame
#   2. "preprocess" - imputation / scaling / one-hot encoding
#   3. "model"      - the regressor (RandomForest by default)
pipe = Pipeline([
    ("outliers", OutlierCleaner(numeric_only=True, method="cap", k=1.5)),
    ("preprocess", preprocessor),
    ("model", RandomForestRegressor(random_state=RANDOM_STATE)),
])

pipe


0,1,2
,steps,"[('outliers', ...), ('preprocess', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,method,'cap'
,k,1.5
,numeric_only,True

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,'missing'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True



## Grid Search space

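We search across:
- Outlier cleaning: `method ∈ {cap, median}` (with `k` held at the pipeline default of `1.5`)
- Categorical imputer: `most_frequent` vs `constant` (constant fill value `"missing"`)
- Model hyperparameters for **RandomForestRegressor**: `n_estimators ∈ {300, 600}`, `max_depth ∈ {None, 12, 20}`, `max_features ∈ {sqrt, log2, 0.6, 1.0}`

That is 2 × 2 × 2 × 3 × 4 = 96 candidates, i.e. 480 fits with 5-fold CV.

Not searched in the grid below (each stays at its pipeline default), but natural extensions:
- Outlier threshold: `k ∈ {1.0, 1.5, 2.0, 3.0}`
- Numeric imputer: `mean` vs `median`
- Scaling: **enabled** vs **disabled** (by swapping scaler with `passthrough`)
- A second model family, e.g. **Ridge** (alpha)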
  
> Tip: You can expand or reduce the grid to fit your compute budget.


In [11]:

# Ask scikit-learn transformers to return DataFrames instead of NumPy arrays.
# This is a global setting, so it applies to everything run after this cell.
from sklearn import set_config
set_config(transform_output="pandas")  # get DataFrame from transformers for readability

# One search branch: RandomForest on top of the shared preprocessing.
# Keys use the "<step>__<param>" convention to reach into nested pipeline steps.
param_grid = [
    {
        # model and its hyperparameters
        "model": [RandomForestRegressor(random_state=RANDOM_STATE)],
        "model__n_estimators": [300, 600],
        "model__max_depth": [None, 12, 20],
        "model__max_features": ["sqrt", "log2", 0.6, 1.0],

        # outlier handling
        "outliers__method": ["cap", "median"],

        # categorical imputation; fill_value is used when strategy='constant'
        "preprocess__cat__imputer__strategy": ["most_frequent", "constant"],
        "preprocess__cat__imputer__fill_value": ["missing"],
    },
]

# 5-fold CV scored by negated RMSE (higher is better), using all CPU cores.
search = GridSearchCV(
    pipe,
    param_grid,
    cv=5,
    scoring="neg_root_mean_squared_error",
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
)
search


0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"[{'model': [RandomForestR...ndom_state=42)], 'model__max_depth': [None, 12, ...], 'model__max_features': ['sqrt', 'log2', ...], 'model__n_estimators': [300, 600], ...}]"
,scoring,'neg_root_mean_squared_error'
,n_jobs,-1
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,True

0,1,2
,method,'cap'
,k,1.5
,numeric_only,True

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,'missing'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True



> **Optional:** The full grid may take time. For a quick smoke test, reduce the grid sizes before running.


## Fit Grid Search & Evaluate on Holdout Test

In [12]:
# --- Fit every candidate with cross-validation on the training split only ---
# (may take several minutes depending on CPU/RAM)
search.fit(X_train, y_train)

# The score is a negated RMSE, so values closer to zero are better.
print("Best Params:", search.best_params_, sep="\n")
print("\nCV best score (neg RMSE):", search.best_score_)


Fitting 5 folds for each of 96 candidates, totalling 480 fits


Best Params:
{'model': RandomForestRegressor(random_state=42), 'model__max_depth': 20, 'model__max_features': 0.6, 'model__n_estimators': 600, 'outliers__method': 'cap', 'preprocess__cat__imputer__fill_value': 'missing', 'preprocess__cat__imputer__strategy': 'most_frequent'}

CV best score (neg RMSE): -25046.451065796853


In [15]:
# These names are already imported in the first cell; the repeat is harmless.
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

# --- Evaluate on test set ---
# The held-out test split is used only here, never during the grid search.
best_model = search.best_estimator_
y_pred = best_model.predict(X_test)

# RMSE = sqrt(MSE)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2   = r2_score(y_test, y_pred)

print("\nTest RMSE:", rmse)
print("Test R^2:", r2)



Test RMSE: 26073.79121213318
Test R^2: 0.9152056721041716
