# Building a data-preparation and model-training pipeline for the housing data, based on the book

## Download data

In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
def load_housing_data(housing_path='datasets/housing'):
    """Read the housing CSV file into a pandas DataFrame.

    # Arguments:
        housing_path: directory that contains `housing.csv`

    # Returns:
        pd.DataFrame: the parsed contents of `housing.csv`
    """
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [3]:
# Load the raw data from the default location ('datasets/housing/housing.csv').
housing = load_housing_data()

In [4]:
# Summarize the columns: dtype and non-null count for each one.
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


All attributes are numerical except `ocean_proximity`.

## Split into train and test sets (stratified sampling)

In [5]:
# Bucket median income into five labelled categories; these categories are
# the strata used for the stratified train/test split.
housing['income_cat'] = pd.cut(
    housing['median_income'],
    bins=[0, 1.5, 3, 4.5, 6, np.inf],
    labels=[1, 2, 3, 4, 5],
)

In [6]:
# Sanity check: show the derived category next to the raw income value.
housing[['income_cat','median_income']].head()

Unnamed: 0,income_cat,median_income
0,5,8.3252
1,5,8.3014
2,5,7.2574
3,4,5.6431
4,3,3.8462


In [7]:
from sklearn.model_selection import StratifiedShuffleSplit

In [8]:
# One stratified shuffle split holding out 20% of the rows, stratified on
# income_cat so train and test keep the same income-category proportions.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so take the single (train, test) pair directly instead of looping.
train_index, test_index = next(split.split(X=housing, y=housing['income_cat']))
# split() yields positional row indices, so rows must be selected with .iloc;
# .loc only gives the same rows while the index is a default RangeIndex.
# .copy() makes both sets independent of `housing` before they are modified.
strat_train_set = housing.iloc[train_index].copy()
strat_test_set = housing.iloc[test_index].copy()

Now that we have a test set that is representative of the `income_cat` distribution, it's time to remove the `income_cat` column from both sets:

In [9]:
# income_cat was only needed to stratify the split. Drop it by reassignment
# rather than in place, and ignore an already-missing column, so that
# re-running this cell does not raise a KeyError.
strat_train_set = strat_train_set.drop(columns='income_cat', errors='ignore')
strat_test_set = strat_test_set.drop(columns='income_cat', errors='ignore')

## Prepare data for training ML models

In [10]:
# Predictors: every training column except the target.
# Note that this rebinds `housing` from the full data set to the training predictors.
housing = strat_train_set.drop(columns='median_house_value')
# Target: taken as a copy so it is independent of strat_train_set.
housing_labels = strat_train_set['median_house_value'].copy()
(housing.shape, housing_labels.shape)

((16512, 9), (16512,))

### Data cleaning
Check for missing values in each column:

In [11]:
# Number of missing values in each predictor column.
housing.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        158
population              0
households              0
median_income           0
ocean_proximity         0
dtype: int64

### Full pipeline for preparing data for housing numerical & cat columns

In [12]:
# Numerical predictors only: everything except the text column.
housing_num = housing.drop(columns='ocean_proximity')

In [13]:
# Names of the numerical columns, in DataFrame order.
num_attribs = housing_num.columns.tolist()

In [14]:
# Show the numerical column names that the numerical pipeline will select.
num_attribs

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income']

In [15]:
# Categorical column(s), selected by the categorical pipeline.
cat_attribs = ["ocean_proximity"]

In [16]:
from sklearn.impute import SimpleImputer
from sklearn_features.transformers import DataFrameSelector
from sklearn.impute import SimpleImputer

In [27]:
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import LabelBinarizer

In [18]:
# Column positions used by CombinedAttributesAdder. They are looked up by
# name in num_attribs (the column order fed to the numerical pipeline), so
# they stay correct if the numerical column order ever changes.
rooms_ix, bedrooms_ix, population_ix, household_ix = (
    num_attribs.index(col)
    for col in ('total_rooms', 'total_bedrooms', 'population', 'households')
)

In [19]:
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features computed from existing columns.

    Columns are addressed through the module-level positions `rooms_ix`,
    `bedrooms_ix`, `population_ix` and `household_ix`, and the input is
    indexed as `X[:, i]`, so it must support 2-D positional indexing.

    # Arguments:
        add_bedrooms_per_room: when True, also append bedrooms / rooms
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kwargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        # Nothing is learned from the data; fitting is a no-op.
        return self

    def transform(self, X, y=None):
        """Return X with the ratio columns appended on the right."""
        rooms = X[:, rooms_ix]
        households = X[:, household_ix]
        extra_columns = [
            rooms / households,                # rooms per household
            X[:, population_ix] / households,  # population per household
        ]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)  # bedrooms per room
        return np.c_[(X, *extra_columns)]

In [28]:
# https://stackoverflow.com/questions/46162855/fit-transform-takes-2-positional-arguments-but-3-were-given-with-labelbinarize
class LabelBinarizerPipelineFriendly(LabelBinarizer):
    """LabelBinarizer whose methods accept the `(X, y)` call signature.

    The parent class's methods take a single argument, while a pipeline
    calls its steps with both `X` and `y`. These overrides accept the extra
    `y` argument and ignore it.
    """

    def fit(self, X, y=None):
        """Fit the binarizer on X (y is ignored) and return self."""
        super().fit(X)
        # Estimators must return self from fit so that calls can be chained,
        # e.g. `binarizer.fit(X).transform(X)`.
        return self

    def transform(self, X, y=None):
        """Binarize X with the fitted classes (y is ignored)."""
        return super().transform(X)

    def fit_transform(self, X, y=None):
        """Fit on X and return its binarized form (y is ignored)."""
        return self.fit(X).transform(X)

In [29]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [30]:
# Numerical branch, applied in order: column selection, median imputation
# of missing values, ratio-feature creation, then scaling.
num_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

In [31]:
# Categorical branch: column selection followed by binarization of the
# category labels.
cat_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(cat_attribs)),
    ('label_binarizer', LabelBinarizerPipelineFriendly()),
])

In [32]:
# Run both branches on the same input and concatenate their outputs,
# numerical features first, categorical features last.
full_pipeline = FeatureUnion([
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

In [33]:
# Fit every pipeline step on the training predictors and transform them in one call.
housing_prepared = full_pipeline.fit_transform(housing)

In [34]:
# Inspect the prepared feature matrix.
housing_prepared

array([[-1.15604281,  0.77194962,  0.74333089, ...,  0.        ,
         0.        ,  0.        ],
       [-1.17602483,  0.6596948 , -1.1653172 , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.18684903, -1.34218285,  0.18664186, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 1.58648943, -0.72478134, -1.56295222, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.78221312, -0.85106801,  0.18664186, ...,  0.        ,
         0.        ,  0.        ],
       [-1.43579109,  0.99645926,  1.85670895, ...,  0.        ,
         1.        ,  0.        ]])

## Training model


### linear regression

In [38]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [37]:
# Fit a linear regression model on the prepared training features.
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

LinearRegression()

In [39]:
# Predict on the same data the model was fitted on (training-set predictions).
housing_predictions = lin_reg.predict(housing_prepared)

In [40]:
# Mean squared error of the linear model on the training set.
lin_mse = mean_squared_error(y_true=housing_labels, y_pred=housing_predictions)

In [41]:
# The square root of the MSE gives the RMSE, in the same units as the labels.
lin_rmse = np.sqrt(lin_mse)

In [42]:
# Training-set RMSE of the linear regression model.
lin_rmse

68628.19819848923

The linear regression model is underfitting the training data: its RMSE is high even on the data it was trained on.

### Decision tree (`DecisionTreeRegressor`)

In [44]:
from sklearn.tree import DecisionTreeRegressor

In [45]:
# Fix the seed so the fitted tree, and every score derived from it, is
# reproducible across kernel restarts (same seed as the train/test split).
tree_reg = DecisionTreeRegressor(random_state=42)

In [46]:
# Train the tree on the full prepared training set.
tree_reg.fit(housing_prepared, housing_labels)

DecisionTreeRegressor()

In [47]:
# Training-set predictions of the tree. This overwrites the
# `housing_predictions` produced earlier by the linear model.
housing_predictions = tree_reg.predict(housing_prepared)

In [48]:
# Training-set RMSE of the decision tree (square root of the MSE).
tree_mse = mean_squared_error(y_true=housing_labels, y_pred=housing_predictions)
tree_rmse = np.sqrt(tree_mse)
print(tree_rmse)

0.0


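The `DecisionTreeRegressor` is overfitting: a training RMSE of 0.0 means it reproduces the training labels exactly, so the training error says nothing about how it generalizes.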

## Cross validation

In [49]:
from sklearn.model_selection import cross_val_score

In [51]:
# 10-fold cross-validation of the decision tree. The scorer reports negated
# MSE, so flip the sign before taking the square root to get per-fold RMSE.
scores = cross_val_score(
    estimator=tree_reg,
    X=housing_prepared,
    y=housing_labels,
    scoring='neg_mean_squared_error',
    cv=10,
)
rmse_scores = np.sqrt(-scores)

In [50]:
def display_scores(scores):
    """Print the scores followed by their mean and standard deviation.

    # Arguments:
        scores: object providing `.mean()` and `.std()` (e.g. a numpy array)
    """
    print("Scores:", scores)
    average = scores.mean()
    print("Mean:", average)
    spread = scores.std()
    print("Standard deviation:", spread)

In [53]:
# Cross-validated RMSE of the decision tree.
display_scores(rmse_scores)

Scores: [68977.63371514 67942.85538678 69924.95909514 69294.28255861
 71085.96248336 73763.55750418 70112.62978003 71986.39925913
 76128.78842896 70535.68769553]
Mean: 70975.2755906875
Standard deviation: 2311.2826795412125


In [57]:
# 10-fold cross-validation of the linear regression model. The scorer
# reports negated MSE, so negate it before the square root to get RMSE.
lin_scores = cross_val_score(
    estimator=lin_reg,
    X=housing_prepared,
    y=housing_labels,
    scoring='neg_mean_squared_error',
    cv=10,
)
lin_rmse_scores = np.sqrt(-lin_scores)

In [58]:
# Cross-validated RMSE of the linear regression model.
display_scores(lin_rmse_scores)

Scores: [66782.73843989 66960.118071   70347.95244419 74739.57052552
 68031.13388938 71193.84183426 64969.63056405 68281.61137997
 71552.91566558 67665.10082067]
Mean: 69052.46136345083
Standard deviation: 2731.674001798342


### RandomForestRegressor

In [59]:
from sklearn.ensemble import RandomForestRegressor

In [62]:
# Fix the seed so the fitted forest, and every score derived from it, is
# reproducible across kernel restarts (same seed as the train/test split).
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(housing_prepared, housing_labels)
# Training-set RMSE only: it is measured on the data the forest was fitted
# on, so compare it with the cross-validated scores before trusting it.
forest_housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, forest_housing_predictions)
forest_rmse = np.sqrt(forest_mse)
print(forest_rmse)

18747.66560686349


In [63]:
# 10-fold cross-validation of the random forest. The scorer reports negated
# MSE, so negate it before the square root to get per-fold RMSE.
forest_scores = cross_val_score(
    estimator=forest_reg,
    X=housing_prepared,
    y=housing_labels,
    scoring='neg_mean_squared_error',
    cv=10,
)
forest_rmse_scores = np.sqrt(-forest_scores)

In [65]:
# Cross-validated RMSE of the random forest.
display_scores(forest_rmse_scores)

Scores: [48997.72182019 47420.10710868 49671.71721341 52207.5246772
 49720.89426857 53606.20448678 48646.95283014 48141.2460102
 52946.255842   49957.40695671]
Mean: 50131.60312138742
Standard deviation: 1987.931119414355
