In [2]:
import os 
import tarfile
from six.moves import urllib
import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression 
from sklearn.tree import DecisionTreeRegressor
from sklearn.impute import SimpleImputer



In [3]:
# Remote location of the housing archive and the local directory it is unpacked into.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"


def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the gzipped tar archive to download.
    housing_path : str
        Directory that receives ``housing.tgz`` and the extracted files.
        It is created when it does not exist yet.

    Returns
    -------
    None
        The function is called for its side effects on the file system.
    """
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        # The archive was just fetched from the network. Where the running
        # Python provides extraction filters, use the "data" filter so members
        # cannot be written outside housing_path; older interpreters fall back
        # to the plain call.
        if hasattr(tarfile, "data_filter"):
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            housing_tgz.extractall(path=housing_path)

In [4]:
# Download and unpack the archive using the default URL and target directory.
fetch_housing_data()

In [5]:
def load_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` located in ``housing_path`` and return the DataFrame."""
    housing_csv = os.path.join(housing_path, "housing.csv")
    housing_frame = pd.read_csv(housing_csv)
    return housing_frame

In [6]:
# Read housing.csv into a DataFrame from the default housing directory.
X = load_data()

In [7]:
# Separate the target column from the features. The guard plus rebinding
# (instead of an in-place drop) lets this cell be re-run: once the target has
# been removed from X, a second run keeps the existing X_label rather than
# raising KeyError.
if "median_house_value" in X.columns:
    X_label = X["median_house_value"].copy()
    X = X.drop(columns="median_house_value")

# X_num holds every remaining column except 'ocean_proximity'; X_cat is that
# column on its own.
X_num = X.drop(columns="ocean_proximity")
X_cat = X["ocean_proximity"]


In [8]:
class MyLabelBinarizer(TransformerMixin):
    """Wrapper around ``LabelBinarizer`` whose ``fit`` and ``transform`` accept
    an extra ``y`` argument and ignore it."""

    def __init__(self, *args, **kwargs):
        # Every constructor argument is forwarded to the wrapped encoder.
        wrapped = LabelBinarizer(*args, **kwargs)
        self.encoder = wrapped

    def fit(self, x, y=0):
        """Fit the wrapped encoder on ``x`` (``y`` is unused) and return ``self``."""
        self.encoder.fit(x)
        return self

    def transform(self, x, y=0):
        """Return the wrapped encoder's transform of ``x`` (``y`` is unused)."""
        binarized = self.encoder.transform(x)
        return binarized

In [62]:
# Column positions used by CombinedAttributesAdder. They are looked up in
# X_num because the numeric pipeline hands that transformer an array built from
# the X_num columns; positions taken from X only agree with it while every
# non-numeric column sits after these four.
rooms_ix, bedrooms_ix, population_ix, household_ix = [
    list(X_num.columns).index(col)
    for col in ("total_rooms", "total_bedrooms", "population", "households")]

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio columns to an array indexed as ``X[:, i]``.

    Rooms per household and population per household are always appended;
    bedrooms per room is appended as a third column when
    ``add_bedrooms_per_room`` is true. Column positions come from the
    module-level ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and
    ``household_ix``.
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kwargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned; return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right."""
        households = X[:, household_ix]
        rooms = X[:, rooms_ix]
        extra_columns = [rooms / households,
                         X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)
        # Indexing np.c_ with a tuple is the same call as listing the items.
        return np.c_[tuple([X] + extra_columns)]
        


In [50]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Select ``attribute_names`` from the input and return the ``.values`` of
    the selection."""

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned; return ``self``."""
        return self

    def transform(self, X):
        """Index ``X`` with the stored attribute names and return ``.values``."""
        chosen = X[self.attribute_names]
        return chosen.values
    

In [116]:
# Column groups routed to the two sub-pipelines.
num_attributes = list(X_num.columns)
cat_attributes = ['ocean_proximity']

# Numeric branch: select columns -> median imputation -> extra ratio
# attributes -> standard scaling.
num_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(num_attributes)),
    ('imputer', SimpleImputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

# Categorical branch: select the column -> label binarization.
cat_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(cat_attributes)),
    ('encoder', MyLabelBinarizer()),
])

# Both branches run on the same input and are combined by FeatureUnion.
full_pipeline = FeatureUnion([
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

In [117]:
# Fit every transformer of the combined pipeline on X and transform X with it.
# NOTE(review): this runs on all rows before the train/test split, so the
# fitted imputer and scaler see rows that later end up in the test set --
# consider splitting first and fitting on the training rows only.
X_prep = full_pipeline.fit_transform(X)

In [118]:
# Hold out 15% of the rows for evaluation. The fixed seed makes the split
# repeatable across kernel restarts; without it every run evaluates on a
# different test set.
X_train, X_test, y_train, y_test = train_test_split(
    X_prep, X_label, test_size=0.15, random_state=42)

In [119]:
def tree(X_train, X_test, y_train, y_test, random_state=None):
    """Fit a ``DecisionTreeRegressor`` and return its RMSE on the test split.

    Parameters
    ----------
    X_train, y_train
        Features and targets the regressor is fitted on.
    X_test, y_test
        Features and targets the fitted regressor is scored on.
    random_state : optional
        Forwarded to ``DecisionTreeRegressor``. The default ``None`` keeps the
        previous unseeded behaviour; pass a fixed value for repeatable results.

    Returns
    -------
    The square root of the mean squared error between ``y_test`` and the
    predictions for ``X_test``.
    """
    # Named `model` so the local does not shadow this function's own name.
    model = DecisionTreeRegressor(random_state=random_state)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    # mean_squared_error expects (y_true, y_pred); the value is the same either
    # way, but the documented order keeps the call correct if options such as
    # sample weights are added later.
    mse = mean_squared_error(y_test, predictions)
    return np.sqrt(mse)
# Display the decision tree's RMSE on the held-out rows.
tree(X_train, X_test, y_train, y_test)

66297.19022862091

In [120]:
def lin_reg(X_train, X_test, y_train, y_test):
    """Fit a ``LinearRegression`` and return its RMSE on the test split.

    Parameters
    ----------
    X_train, y_train
        Features and targets the model is fitted on.
    X_test, y_test
        Features and targets the fitted model is scored on.

    Returns
    -------
    The square root of the mean squared error between ``y_test`` and the
    predictions for ``X_test``.
    """
    # Named `model` so the local does not shadow this function's own name.
    model = LinearRegression()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    # mean_squared_error expects (y_true, y_pred); the value is the same either
    # way, but the documented order keeps the call correct if options such as
    # sample weights are added later.
    mse = mean_squared_error(y_test, predictions)
    return np.sqrt(mse)
# Display the linear regression's RMSE on the held-out rows.
lin_reg(X_train, X_test, y_train, y_test)

69306.86203007892