In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import FeatureUnion
from sklearn.metrics import mean_squared_error

In [2]:
housing = pd.read_csv('datasets/housing/housing.csv')
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True)
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()
housing_num = housing.drop("ocean_proximity", axis=1)

In [3]:
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [4]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.attribute_names].values

In [5]:
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('cat_encoder', OneHotEncoder(sparse=False)),
])

full_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline),
])

housing_prepared = full_pipeline.fit_transform(housing)

In [6]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

LinearRegression()

In [7]:
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

68629.05691965688

In [8]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)

housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.0

In [9]:
def display_scores(scores):
    for score in scores:
        print("Each Scores:      : ", score)
    print("Mean              : ", scores.mean())
    print("Standard deviation: ", scores.std())

In [10]:
from sklearn.model_selection import cross_val_score

tree_scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
                              scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-tree_scores)
display_scores(tree_rmse_scores)

Each Scores:      :  69529.41702703432
Each Scores:      :  67576.48628295558
Each Scores:      :  71414.54129897541
Each Scores:      :  69246.2579495247
Each Scores:      :  71129.49764630357
Each Scores:      :  73666.24323781049
Each Scores:      :  70512.22417429947
Each Scores:      :  70023.10487198769
Each Scores:      :  76008.6526230758
Each Scores:      :  70006.28925051155
Mean              :  70911.27143624784
Standard deviation:  2268.1208671983863


In [11]:
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                             scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

Each Scores:      :  66760.10662190618
Each Scores:      :  66953.21094238694
Each Scores:      :  70347.95244419386
Each Scores:      :  74754.56306734771
Each Scores:      :  68023.22687859628
Each Scores:      :  71193.84183425654
Each Scores:      :  64988.34185575058
Each Scores:      :  68272.01595744333
Each Scores:      :  71554.594404356
Each Scores:      :  67665.52898253564
Mean              :  69051.33829887731
Standard deviation:  2735.127757424825


In [12]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(n_estimators=10)
forest_reg.fit(housing_prepared, housing_labels)

rf_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
                            scoring="neg_mean_squared_error", cv=10)
rf_rmse_scores = np.sqrt(-rf_scores)
display_scores(rf_rmse_scores)

Each Scores:      :  51386.73463563584
Each Scores:      :  50424.324640560415
Each Scores:      :  51347.92456634857
Each Scores:      :  54321.751532944785
Each Scores:      :  52233.11782995264
Each Scores:      :  56281.955972484466
Each Scores:      :  52382.220757720206
Each Scores:      :  50392.18502689807
Each Scores:      :  54717.68640349735
Each Scores:      :  52929.12607375279
Mean              :  52641.70274397952
Standard deviation:  1841.9148432555282


In [13]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}
]
forest_reg = RandomForestRegressor(n_estimators=10)
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)

GridSearchCV(cv=5, estimator=RandomForestRegressor(n_estimators=10),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [14]:
grid_search.best_score_

-2513377643.5562963

In [15]:
grid_search.best_params_

{'max_features': 8, 'n_estimators': 30}

In [16]:
grid_search.best_estimator_

RandomForestRegressor(max_features=8, n_estimators=30)

In [18]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

65554.17308391069 {'max_features': 2, 'n_estimators': 3}
55465.18251464602 {'max_features': 2, 'n_estimators': 10}
52740.670508714036 {'max_features': 2, 'n_estimators': 30}
60923.960381163626 {'max_features': 4, 'n_estimators': 3}
52891.96568922673 {'max_features': 4, 'n_estimators': 10}
50557.26634779562 {'max_features': 4, 'n_estimators': 30}
59180.42068097211 {'max_features': 6, 'n_estimators': 3}
52588.211779651334 {'max_features': 6, 'n_estimators': 10}
50202.99523716494 {'max_features': 6, 'n_estimators': 30}
58915.93674009808 {'max_features': 8, 'n_estimators': 3}
52131.86015105884 {'max_features': 8, 'n_estimators': 10}
50133.597951436685 {'max_features': 8, 'n_estimators': 30}
62095.39185453878 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
55096.943275207974 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
59460.82106735757 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
52645.462042689476 {'bootstrap': False, 'max_features': 3, 'n_estimator

In [19]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([6.91102255e-02, 6.14211491e-02, 4.34469888e-02, 1.58510854e-02,
       1.52739494e-02, 1.56290074e-02, 1.38542730e-02, 3.74171730e-01,
       5.19696740e-02, 1.10379897e-01, 6.38894037e-02, 8.37337109e-03,
       1.48147207e-01, 5.65078762e-05, 4.00533585e-03, 4.42019505e-03])

In [26]:
extra_attribs = ["room per hhold", "pop_per_hhold", "bedrooms_per_room"]
cat_one_hot_attribs = ["<1H OCEAN", "NEAR OCEAN", "NEAR BAY", "INLAND", "ISLAND"]
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)

[(0.37417173004481713, 'median_income'),
 (0.14814720650205404, 'NEAR OCEAN'),
 (0.11037989709806174, 'pop_per_hhold'),
 (0.06911022549585734, 'longitude'),
 (0.06388940373187556, 'bedrooms_per_room'),
 (0.061421149143125194, 'latitude'),
 (0.05196967403580019, 'room per hhold'),
 (0.043446988827336844, 'housing_median_age'),
 (0.01585108543322403, 'total_rooms'),
 (0.015629007400663905, 'population'),
 (0.015273949437776423, 'total_bedrooms'),
 (0.013854272986007388, 'households'),
 (0.008373371085848118, '<1H OCEAN'),
 (0.004420195050377698, 'ISLAND'),
 (0.004005335850996368, 'INLAND'),
 (5.6507876178005386e-05, 'NEAR BAY')]