In [1]:
# Build a data transformation pipeline.
# Plan to deploy on train_set and test_set.
# Incorporate lessons from previous exploratory steps.
# Use sklearn modules that provide the glue.

# First, load the data.
import pandas as pd
datapath="/Users/jasonmiller/Source/MachineLearning/datasets/housing/housing.csv"
all_data=pd.read_csv(datapath)
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(all_data,test_size=0.2,random_state=42)
train_predictors = train_set.drop(["median_house_value"],axis=1)
train_labels = train_set["median_house_value"].copy()
train_set.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
14196,-117.03,32.71,33.0,3126.0,627.0,2300.0,623.0,3.2596,103000.0,NEAR OCEAN
8267,-118.16,33.77,49.0,3382.0,787.0,1314.0,756.0,3.8125,382100.0,NEAR OCEAN
17445,-120.48,34.66,4.0,1897.0,331.0,915.0,336.0,4.1563,172600.0,NEAR OCEAN
14265,-117.11,32.69,36.0,1421.0,367.0,1418.0,355.0,1.9425,93400.0,NEAR OCEAN
2271,-119.8,36.78,43.0,2382.0,431.0,874.0,380.0,3.5542,96500.0,INLAND


In [23]:
from sklearn.base import BaseEstimator, TransformerMixin
rooms_ix, bedrooms_ix, population_ix, households_ix = 3,4,5,6  # hard coded index
# Subclass must have fit() and transform().
class AddFeatures (BaseEstimator, TransformerMixin):
    def fit(self,X,y=None):
        return self   # required by base class
    def transform(self,X,y=None):
        rooms_per_household = X[:,rooms_ix]/X[:,households_ix]
        population_per_household = X[:,population_ix]/X[:,households_ix]
        bedrooms_per_room = X[:,bedrooms_ix]/X[:,rooms_ix]
        # numpy shorthand for a column-wise concatenation
        return np.c_[X,rooms_per_household,population_per_household,bedrooms_per_room]
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
# Instance has fit_transform() method.
# It calls fit() then transform() on each pipeline component in order.
numeric_pipeline = Pipeline([
    ('imputer',SimpleImputer(strategy='median')),
    ('feater_adder',AddFeatures()),
    ('scaler',StandardScaler())
])
# Get ready to treat numeric and categoric features differently.
categoric_features=['ocean_proximity']
numeric_features = list(train_set)
numeric_features.remove(categoric_features[0])
# Use a meta-pipeline.
# It applies each pipeline to each column.
# It integrates mismatched return types into sparse or dense array depending on density.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
full_pipeline = ColumnTransformer([
    ("num", numeric_pipeline, numeric_features),  # dense matrix
    ("cat", OneHotEncoder(), categoric_features)  # sparse matrix
])
# Run it
prepared_test_set = full_pipeline.fit_transform(test_set)
prepared_test_set   # this is a numpy array

array([[ 0.25541734,  0.22194113, -0.30073951, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.02976613, -0.20947715,  0.098724  , ...,  0.        ,
         0.        ,  0.        ],
       [-1.46454628,  1.03788441,  1.85636346, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-1.2689819 ,  0.80810728, -0.30073951, ...,  0.        ,
         0.        ,  0.        ],
       [-0.120668  ,  0.5548835 ,  0.57808022, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.57634349, -0.64089543, -0.93988113, ...,  0.        ,
         0.        ,  0.        ]])

In [24]:
# Done!
# Ready to train a model.