In [15]:
# Try Linear Regression, etc.

# First, resurrect the data.
import os
import pandas as pd
import numpy as np
# The CSV location can be overridden with the HOUSING_CSV environment variable
# so the notebook runs on machines other than the author's; the original
# absolute path remains the default.
datapath = os.environ.get(
    "HOUSING_CSV",
    "/Users/jasonmiller/Source/MachineLearning/datasets/housing/housing.csv",
)
all_data = pd.read_csv(datapath)
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the fixed seed keeps the split stable
# across kernel restarts.
train_set, test_set = train_test_split(all_data, test_size=0.2, random_state=42)
# Separate the target column from the predictors (drop() returns a new frame).
train_predictors = train_set.drop(["median_house_value"], axis=1)
train_labels = train_set["median_house_value"].copy()
from sklearn.base import BaseEstimator, TransformerMixin
rooms_ix, bedrooms_ix, population_ix, households_ix = 3,4,5,6  # hard coded index
class AddFeatures (BaseEstimator, TransformerMixin):
    """Append three ratio columns to a 2-D numeric array.

    The appended columns are, in order: rooms per household, population per
    household, and bedrooms per room.

    Parameters
    ----------
    rooms_ix, bedrooms_ix, population_ix, households_ix : int
        Positional column indices of the source columns in ``X``. They default
        to the module-level constants defined just above the class, so
        ``AddFeatures()`` behaves as before.
    """
    def __init__(self, rooms_ix=rooms_ix, bedrooms_ix=bedrooms_ix,
                 population_ix=population_ix, households_ix=households_ix):
        # Store the arguments unmodified under the same names so that
        # BaseEstimator.get_params()/set_params() and clone() work.
        self.rooms_ix = rooms_ix
        self.bedrooms_ix = bedrooms_ix
        self.population_ix = population_ix
        self.households_ix = households_ix
    def fit(self,X,y=None):
        """Nothing is learned from the data; return self so the step can sit in a Pipeline."""
        return self   # required by base class
    def transform(self,X,y=None):
        """Return ``X`` with the three ratio columns appended on the right."""
        # Accept array-likes (e.g. a DataFrame) as well as ndarrays; the
        # positional X[:, i] indexing below needs an ndarray.
        X = np.asarray(X)
        rooms_per_household = X[:,self.rooms_ix]/X[:,self.households_ix]
        population_per_household = X[:,self.population_ix]/X[:,self.households_ix]
        bedrooms_per_room = X[:,self.bedrooms_ix]/X[:,self.rooms_ix]
        # numpy shorthand for a column-wise concatenation
        return np.c_[X,rooms_per_household,population_per_household,bedrooms_per_room]
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
# Numeric columns: fill missing values with the column median, append the
# ratio features, then standardize.
# NOTE: the step name 'feater_adder' (sic) is kept as-is because step names
# are part of the pipeline's parameter keys.
numeric_pipeline = Pipeline([
    ('imputer',SimpleImputer(strategy='median')),
    ('feater_adder',AddFeatures()),
    ('scaler',StandardScaler())
])
categoric_features=['ocean_proximity']
# Every predictor column that is not categorical is treated as numeric.
# Excluding all entries of categoric_features (not only the first) keeps this
# correct if more categorical columns are added to the list.
numeric_features = [col for col in train_predictors if col not in categoric_features]
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
full_pipeline = ColumnTransformer([
    ("num", numeric_pipeline, numeric_features),  # dense matrix
    # handle_unknown="ignore": a category that was not seen during fit is
    # encoded as all zeros at transform time instead of raising an error.
    ("cat", OneHotEncoder(handle_unknown="ignore"), categoric_features)  # sparse matrix
])
# Fit the preprocessing on the training predictors and transform them in one pass.
prepared_train_predictors = full_pipeline.fit_transform(train_predictors)

In [17]:
# Spot check: take the labels of the first five training rows.
# These rows were used for fitting, so predictions on them should look good.
some_labels = train_labels.iloc[:5]  # explicit positional slice
some_labels

14196    103000.0
8267     382100.0
17445    172600.0
14265     93400.0
2271      96500.0
Name: median_house_value, dtype: float64

In [26]:
# First model: Linear Regression
from sklearn.linear_model import LinearRegression
# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(prepared_train_predictors, train_labels)
# Predict for the first five training rows. The pipeline is already fitted,
# so only transform() is applied here -- never fit_transform().
some_data = train_predictors.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
some_predictions = model.predict(some_data_prepared)
some_predictions

array([181746.54359616, 290558.74973505, 244957.50017771, 146498.51061398,
       163230.42393939])

In [27]:
# Residuals (actual minus predicted) for the five sample rows:
# each one is off by more than $50K.
some_labels.sub(some_predictions)

14196   -78746.543596
8267     91541.250265
17445   -72357.500178
14265   -53098.510614
2271    -66730.423939
Name: median_house_value, dtype: float64

In [28]:
# A more rigorous check: root-mean-squared error over the whole training set.
# It tells the same story -- the typical error is above $50K.
# The book suggests adding more features, such as log(population).
# `model` is still the linear regression fitted in the previous cell.
from sklearn.metrics import mean_squared_error
lin_predictions = model.predict(prepared_train_predictors)
lin_mse = mean_squared_error(y_true=train_labels, y_pred=lin_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

67593.20745775253

In [30]:
# Second model: Decision Tree.
# Zero error indicates overfitting.
# Need to do cross-validation (next step).
from sklearn.tree import DecisionTreeRegressor
# Fixed seed (same value as the train/test split) so the fitted tree is
# reproducible across kernel restarts.
model = DecisionTreeRegressor(random_state=42)
model.fit(prepared_train_predictors, train_labels)
# RMSE on the same data the tree was trained on.
dtree_predictions = model.predict(prepared_train_predictors)
dtree_mse = mean_squared_error(train_labels,dtree_predictions)
dtree_rmse = np.sqrt(dtree_mse)
dtree_rmse

0.0

In [33]:
# Third model: Random Forest
# This ran for a long time (one minute?) on a single core.
# Best so far.
from sklearn.ensemble import RandomForestRegressor
# random_state makes the forest reproducible across kernel restarts (same seed
# as the train/test split); n_jobs=-1 builds the trees on all available cores,
# which shortens the fit without changing the result for a given seed.
model = RandomForestRegressor(random_state=42, n_jobs=-1)
model.fit(prepared_train_predictors, train_labels)
# RMSE on the training set itself, so this is an optimistic estimate.
rf_predictions = model.predict(prepared_train_predictors)
rf_mse = mean_squared_error(train_labels,rf_predictions)
rf_rmse = np.sqrt(rf_mse)
rf_rmse

18445.568303023643