# MATM063: Principles of Data Science, Python LAB 

## Worksheet 4 (Week 4)

### MATM063: Preparing data for ML & Regression models

#### Question 1:
Here, you should combine the material we discussed in the lectures into one running Python script that reads in the housing file, creates a test and
training set, transforms the data such that it can be used in a machine learning algorithm, and then trains a linear regression model.
Please proceed as follows to create such script:

1. Start you script with the part that reads in the housing data file from the internet https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.tgz.
2. Load the housing.csv file using the fetch_housing_data() function.
3. Split your data into test and training data by using Stratified Shuffle Split (20% test data) while assuring that median_income is proportionally
represented in the test data.
4. Separate the target median_house_value from the predictors (similarly to housing and housing_labels as done in the lectures). Then, define
numerical attributes and categorical attributes (in the lectures we called them num_attributs and cat_attributs).
5. Write a full pipeline that fits and transforms housing to housing_prepared that can be used to train ML models. Make sure that you can change
the option add_bedrooms_per_room from True to False as we will need this at the next step.
6. Define and train a linear regression model (as done in the lectures). Output both, the model’s RMSE for the training dataset and also the cross
validation scores (10-folds). Compare two realization of these models, a) one where you set Standard Scaler (you should
have very similar results as in the lecture notes); and b) one with add_bedrooms_per_room = False
7. Test also the performance of these two models (for the default customer transformer setting) when comparing min-max scaling vs standardisation.
8. Finally test (for the default customer transformer setting and standardisation) what happens when you use the Ordinal Encoder rather then the
OneHotEncoder.

In [5]:
import os
import tarfile
import urllib
import pandas as pd
import numpy as np

#1. Reading in the housing data file from the internet.

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    os.makedirs(housing_path, exist_ok=True)
    tgz_path=os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

def load_housing_data(housing_path=HOUSING_PATH):
    csv_path=os.path.join(housing_path,"housing.csv")
    return pd.read_csv(csv_path)

#2. Loading the housing.cvs file using the fetch_housing_data() function.

# execute these functions:
fetch_housing_data() # fetch the data
housing = load_housing_data() #loading the data

#3. Split data into test and training data using Stratified Shuffle Split.

housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0, 2, 3, 4, 5, 6, np.inf], #note if values in median income are below bin level, we get NaN
                               labels=[0, 3, 6, 9, 12, 15])

from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2,random_state=41)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

# remove income_cat to bring data back to original state:
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)

#4. Separate target median_house_value from the predictors, then define numerical and categorical attributes

housing = strat_train_set.drop("median_house_value", axis=1)
# drop() creates a copy of the data and does not affect strat_train_set
housing_labels = strat_train_set["median_house_value"].copy()
housing_num = housing.drop("ocean_proximity", axis=1) # drop category attributes:
num_attributs = list(housing_num) # give list of column names
cat_attributs = ['ocean_proximity'] # define category names

#5. Show the full 'Transformation Pipeline'.

#import necessary packages first
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

std_scale = StandardScaler()
housing_std = std_scale.fit_transform(housing_num)

#we do it again... as we did in part 4
housing = strat_train_set.drop("median_house_value", axis=1) # separate target from predictors
housing_labels = strat_train_set["median_house_value"].copy()
housing_num = housing.drop("ocean_proximity", axis=1) # drop category attributes:

rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6 # indices for attribute columns
class CombinedAttributesAdder (BaseEstimator, TransformerMixin): # set up customer transformers:
    def __init__ (self, add_bedrooms_per_room=True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

num_pipeline = Pipeline([('imputer', SimpleImputer(strategy='median')),
                         ('attribs_adder',CombinedAttributesAdder(add_bedrooms_per_room=True)),
                         ('std_scaler', StandardScaler()),])

num_attributs = list(housing_num) # give list of column names
cat_attributs = ['ocean_proximity'] # define category names
full_pipeline = ColumnTransformer([('num', num_pipeline, num_attributs),
                                   ('cat', OneHotEncoder(), cat_attributs),])

housing_prepared = full_pipeline.fit_transform(housing) # full transformation as numpy array

#6. Define and train a linear regression model, output both the model's RMSE
#for the training data set and also the cross validations scores (10-fold)
#compare two realization of these models, one for add_bedrooms_per_room = TRUE, one for FALSE

# we load a linear regression model:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression() # create instance
lin_reg.fit(housing_prepared, housing_labels) # train the model

#Evaluating the RMSE (Root-mean-square-error)
from sklearn.metrics import mean_squared_error
housing_predictions = lin_reg.predict(housing_prepared)
lin_rmse = np.sqrt(mean_squared_error(housing_labels,housing_predictions))
print(f'The RMSE error for add_bedrooms_per_room = TRUE is',lin_rmse)

#Evaluating the k-fold Cross-validation (where k = 10)
from sklearn.model_selection import cross_val_score
def display_scores(scores):
    print ("Scores for add_bedrooms_per_room = TRUE:", scores)
    print ("Mean for add_bedrooms_per_room = TRUE:", scores.mean())
    print ("Standard deviation for add_bedrooms_per_room = TRUE:", scores.std())

lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                             scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores.round())

#NOW REPEAT FOR add_bedrooms_per_room = FALSE!!

housing = strat_train_set.drop("median_house_value", axis=1) # separate target from predictors
housing_labels = strat_train_set["median_house_value"].copy()
housing_num = housing.drop("ocean_proximity", axis=1) # drop category attributes:

rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6 # indices for attribute columns
class CombinedAttributesAdder (BaseEstimator, TransformerMixin): # set up customer transformers:
    def __init__ (self, add_bedrooms_per_room=False): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

num_pipeline = Pipeline([('imputer', SimpleImputer(strategy='median')),
                         ('attribs_adder',CombinedAttributesAdder(add_bedrooms_per_room=True)),
                         ('std_scaler', StandardScaler()),])

num_attributs = list(housing_num) # give list of column names
cat_attributs = ['ocean_proximity'] # define category names
full_pipeline = ColumnTransformer([('num', num_pipeline, num_attributs),
                                   ('cat', OneHotEncoder(), cat_attributs),])

housing_prepared = full_pipeline.fit_transform(housing) # full transformation as numpy array

# we load a linear regression model:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression() # create instance
lin_reg.fit(housing_prepared, housing_labels) # train the model

#Evaluating the RMSE (Root-mean-square-error)
from sklearn.metrics import mean_squared_error
housing_predictions = lin_reg.predict(housing_prepared)
lin_rmse = np.sqrt(mean_squared_error(housing_labels,housing_predictions))
print(f'The RMSE error for add_bedrooms_per_room = FALSE is',lin_rmse)

#Evaluating the k-fold Cross-validation (where k = 10)
from sklearn.model_selection import cross_val_score
def display_scores(scores):
    print ("Scores for add_bedrooms_per_room = FALSE:", scores)
    print ("Mean for add_bedrooms_per_room = FALSE:", scores.mean())
    print ("Standard deviation for add_bedrooms_per_room = FALSE:", scores.std())

lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                             scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores.round())

#7. Test performance of these two models when comparing min-max scaling vs standardisation
#(for the default customer transformer setting, keep setting to TRUE I suppose?)

housing = strat_train_set.drop("median_house_value", axis=1) # separate target from predictors
housing_labels = strat_train_set["median_house_value"].copy()
housing_num = housing.drop("ocean_proximity", axis=1) # drop category attributes:

rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6 # indices for attribute columns
class CombinedAttributesAdder (BaseEstimator, TransformerMixin): # set up customer transformers:
    def __init__ (self, add_bedrooms_per_room=True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

num_pipeline = Pipeline([('imputer', SimpleImputer(strategy='median')),
                         ('attribs_adder',CombinedAttributesAdder(add_bedrooms_per_room=True)),
                         ('std_scaler', StandardScaler()),])

num_attributs = list(housing_num) # give list of column names
cat_attributs = ['ocean_proximity'] # define category names
full_pipeline = ColumnTransformer([('num', num_pipeline, num_attributs),
                                   ('cat', OneHotEncoder(), cat_attributs),])

housing_prepared = full_pipeline.fit_transform(housing) # full transformation as numpy array

# we load a linear regression model:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression() # create instance
lin_reg.fit(housing_prepared, housing_labels) # train the model

#Evaluating the RMSE (Root-mean-square-error)
from sklearn.metrics import mean_squared_error
housing_predictions = lin_reg.predict(housing_prepared)
lin_rmse = np.sqrt(mean_squared_error(housing_labels,housing_predictions))
print(f'The RMSE error for Standard Scalar is',lin_rmse)

#Evaluating the k-fold Cross-validation (where k = 10)
from sklearn.model_selection import cross_val_score
def display_scores(scores):
    print ("Scores for Standard Scalar:", scores)
    print ("Mean for Standard Scalar:", scores.mean())
    print ("Standard deviation for Standard Scalar:", scores.std())

lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                             scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores.round())

#Now do it for MinMaxScalar

from sklearn.preprocessing import MinMaxScaler

housing = strat_train_set.drop("median_house_value", axis=1) # separate target from predictors
housing_labels = strat_train_set["median_house_value"].copy()
housing_num = housing.drop("ocean_proximity", axis=1) # drop category attributes:

rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6 # indices for attribute columns
class CombinedAttributesAdder (BaseEstimator, TransformerMixin): # set up customer transformers:
    def __init__ (self, add_bedrooms_per_room=True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

num_pipeline = Pipeline([('imputer', SimpleImputer(strategy='median')),
                         ('attribs_adder',CombinedAttributesAdder(add_bedrooms_per_room=True)),
                         ('std_scaler', MinMaxScaler()),])

num_attributs = list(housing_num) # give list of column names
cat_attributs = ['ocean_proximity'] # define category names
full_pipeline = ColumnTransformer([('num', num_pipeline, num_attributs),
                                   ('cat', OneHotEncoder(), cat_attributs),])

housing_prepared = full_pipeline.fit_transform(housing) # full transformation as numpy array

# we load a linear regression model:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression() # create instance
lin_reg.fit(housing_prepared, housing_labels) # train the model

#Evaluating the RMSE (Root-mean-square-error)
from sklearn.metrics import mean_squared_error
housing_predictions = lin_reg.predict(housing_prepared)
lin_rmse = np.sqrt(mean_squared_error(housing_labels,housing_predictions))
print(f'The RMSE error for MinMax Scalar is',lin_rmse)

#Evaluating the k-fold Cross-validation (where k = 10)
from sklearn.model_selection import cross_val_score
def display_scores(scores):
    print ("Scores for MinMax Scalar:", scores)
    print ("Mean for MinMax Scalar:", scores.mean())
    print ("Standard deviation for MinMax Scalar:", scores.std())

lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                             scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores.round())

#8. Finally test (for the default customer transformer setting and standardisation) what happens when you use the Ordinal Encoder rather then the
# OneHotEncoder.

housing = strat_train_set.drop("median_house_value", axis=1) # separate target from predictors
housing_labels = strat_train_set["median_house_value"].copy()
housing_num = housing.drop("ocean_proximity", axis=1) # drop category attributes:

rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6 # indices for attribute columns
class CombinedAttributesAdder (BaseEstimator, TransformerMixin): # set up customer transformers:
    def __init__ (self, add_bedrooms_per_room=True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

num_pipeline = Pipeline([('imputer', SimpleImputer(strategy='median')),
                         ('attribs_adder',CombinedAttributesAdder(add_bedrooms_per_room=True)),
                         ('std_scaler', StandardScaler()),])

num_attributs = list(housing_num) # give list of column names
cat_attributs = ['ocean_proximity'] # define category names
full_pipeline = ColumnTransformer([('num', num_pipeline, num_attributs),
                                   ('cat', OneHotEncoder(), cat_attributs),])

housing_prepared = full_pipeline.fit_transform(housing) # full transformation as numpy array

# we load a linear regression model:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression() # create instance
lin_reg.fit(housing_prepared, housing_labels) # train the model

#Evaluating the RMSE (Root-mean-square-error)
from sklearn.metrics import mean_squared_error
housing_predictions = lin_reg.predict(housing_prepared)
lin_rmse = np.sqrt(mean_squared_error(housing_labels,housing_predictions))
print(f'The RMSE error for OneHotEncoder is',lin_rmse)

#Evaluating the k-fold Cross-validation (where k = 10)
from sklearn.model_selection import cross_val_score
def display_scores(scores):
    print ("Scores for OneHotEncoder:", scores)
    print ("Mean for OneHotEncoder:", scores.mean())
    print ("Standard deviation for OneHotEncoder:", scores.std())

lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                             scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores.round())

#Now we do it for the Ordinal Encoder

from sklearn.preprocessing import OrdinalEncoder

housing = strat_train_set.drop("median_house_value", axis=1) # separate target from predictors
housing_labels = strat_train_set["median_house_value"].copy()
housing_num = housing.drop("ocean_proximity", axis=1) # drop category attributes:

rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6 # indices for attribute columns
class CombinedAttributesAdder (BaseEstimator, TransformerMixin): # set up customer transformers:
    def __init__ (self, add_bedrooms_per_room=True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

num_pipeline = Pipeline([('imputer', SimpleImputer(strategy='median')),
                         ('attribs_adder',CombinedAttributesAdder(add_bedrooms_per_room=True)),
                         ('std_scaler', StandardScaler()),])

num_attributs = list(housing_num) # give list of column names
cat_attributs = ['ocean_proximity'] # define category names
full_pipeline = ColumnTransformer([('num', num_pipeline, num_attributs),
                                   ('cat', OrdinalEncoder(), cat_attributs),])

housing_prepared = full_pipeline.fit_transform(housing) # full transformation as numpy array

# we load a linear regression model:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression() # create instance
lin_reg.fit(housing_prepared, housing_labels) # train the model

#Evaluating the RMSE (Root-mean-square-error)
from sklearn.metrics import mean_squared_error
housing_predictions = lin_reg.predict(housing_prepared)
lin_rmse = np.sqrt(mean_squared_error(housing_labels,housing_predictions))
print(f'The RMSE error for OrdinalEncoder is',lin_rmse)

#Evaluating the k-fold Cross-validation (where k = 10)
from sklearn.model_selection import cross_val_score
def display_scores(scores):
    print ("Scores for OrdinalEncoder:", scores)
    print ("Mean for OrdinalEncoder:", scores.mean())
    print ("Standard deviation for OrdinalEncoder:", scores.std())

lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                             scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores.round())

#FINALLY WE SEE DIFFERENCE WITH THE ORDINAL ENCODER THANK GOD!

The RMSE error for add_bedrooms_per_room = TRUE is 67730.8424139431
Scores for add_bedrooms_per_room = TRUE: [63662. 69869. 68162. 68362. 66650. 66548. 69559. 73129. 68120. 64996.]
Mean for add_bedrooms_per_room = TRUE: 67905.7
Standard deviation for add_bedrooms_per_room = TRUE: 2531.523456340075
The RMSE error for add_bedrooms_per_room = FALSE is 67730.8424139431
Scores for add_bedrooms_per_room = FALSE: [63662. 69869. 68162. 68362. 66650. 66548. 69559. 73129. 68120. 64996.]
Mean for add_bedrooms_per_room = FALSE: 67905.7
Standard deviation for add_bedrooms_per_room = FALSE: 2531.523456340075
The RMSE error for Standard Scalar is 67730.8424139431
Scores for Standard Scalar: [63662. 69869. 68162. 68362. 66650. 66548. 69559. 73129. 68120. 64996.]
Mean for Standard Scalar: 67905.7
Standard deviation for Standard Scalar: 2531.523456340075
The RMSE error for MinMax Scalar is 67730.8424139431
Scores for MinMax Scalar: [63662. 69869. 68162. 68362. 66650. 66548. 69559. 73129. 68120. 64996.]


#### Question 2

1. Train a Random Forest Regressor on the prepared housing dataset (Steps 1-5 of Q1).

In [7]:
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)

2. Make a ’no-CV’ prediction (no cross-validation) with this model for the entire training set and output the RMSE.

In [None]:
# We can evalute this trained model on the training set:
housing_predictions = tree_reg.predict(housing_prepared)
tree_rmse = np.sqrt(mean_squared_error(housing_labels, housing_predictions))
tree_rmse #the output is zero

np.float64(0.0)

3. Now, make a prediction using a 5-fold cross-validation and output: Scores, Mean, Standard deviation.

In [10]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
scoring="neg_mean_squared_error", cv=5) #5-fold
tree_rmse_scores = np.sqrt(-scores)

display_scores(tree_rmse_scores.round())

Scores for OrdinalEncoder: [71343. 71914. 73658. 74415. 69267.]
Mean for OrdinalEncoder: 72119.4
Standard deviation for OrdinalEncoder: 1812.0839494901995


4. Compare the RMSE of the no-CV with the CV predictions, and deduce from these values if your model has underfit or overfit the training data?

Score on training set much lower that on evaluation set: DT is
strongly overfitting the training set! 


Reasons for models underfitting training data:

-model not powerful enough

-features in data do not provide enough information

-constraints on model (e.g. via regularization terms) to reduce numbers of parameters

Reasons for overfitting (and potential remedies):

-model to complex (too many parameters) for the information provided (maybe regularize it or take simpler one)

-model has been trained too extensively (early stopping)

-too little data (if possible use more training data)



#### Question 3:

1. Update the customer transformer (Listing 1) such that it has two hyper-parameters: add_bedrooms_per_room = True as well as
add_rooms_per_household = True.

2. Update the pipeline from Q1 such that it uses this new transformer.

3. Try out different combinations of adding or omitting these combined attributes to explore if their presents might improve the performance of the
linear regression model.

I do the changes from part 1 and 2, part 3 requires 4 combinations, TRUE TRUE, TRUE FALSE, FALSE TRUE and FALSE FALSE, to fully explore the space. Make sure to use the linear regression model.

In [12]:
#5. Show the full 'Transformation Pipeline'.

#import necessary packages first
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

std_scale = StandardScaler()
housing_std = std_scale.fit_transform(housing_num)

#we do it again... as we did in part 4
housing = strat_train_set.drop("median_house_value", axis=1) # separate target from predictors
housing_labels = strat_train_set["median_house_value"].copy()
housing_num = housing.drop("ocean_proximity", axis=1) # drop category attributes:

rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6 # indices for attribute columns
class CombinedAttributesAdder (BaseEstimator, TransformerMixin): # set up customer transformers:
    def __init__ (self, add_bedrooms_per_room=True, add_rooms_per_household = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
        self.add_rooms_per_household = add_rooms_per_household
    def fit(self, X, y=None):
        return self # nothing else to do
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

num_pipeline = Pipeline([('imputer', SimpleImputer(strategy='median')),
                         ('attribs_adder',CombinedAttributesAdder(add_bedrooms_per_room=True, add_rooms_per_household=True)),
                         ('std_scaler', StandardScaler()),])

num_attributs = list(housing_num) # give list of column names
cat_attributs = ['ocean_proximity'] # define category names
full_pipeline = ColumnTransformer([('num', num_pipeline, num_attributs),
                                   ('cat', OneHotEncoder(), cat_attributs),])

housing_prepared = full_pipeline.fit_transform(housing) # full transformation as numpy array

# we load a linear regression model:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression() # create instance
lin_reg.fit(housing_prepared, housing_labels) # train the model

#Evaluating the RMSE (Root-mean-square-error)
from sklearn.metrics import mean_squared_error
housing_predictions = lin_reg.predict(housing_prepared)
lin_rmse = np.sqrt(mean_squared_error(housing_labels,housing_predictions))
print(f'The RMSE error for add_bedrooms_per_room = TRUE, add_rooms_per_household = TRUE is',lin_rmse)

#Evaluating the k-fold Cross-validation (where k = 10)
from sklearn.model_selection import cross_val_score
def display_scores(scores):
    print ("Scores for add_bedrooms_per_room = TRUE, add_rooms_per_household = TRUE:", scores)
    print ("Mean for add_bedrooms_per_room = TRUE, add_rooms_per_household = TRUE:", scores.mean())
    print ("Standard deviation for add_bedrooms_per_room = TRUE, add_rooms_per_household = TRUE:", scores.std())

lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                             scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores.round())

The RMSE error for add_bedrooms_per_room = TRUE, add_rooms_per_household = TRUE is 67730.8424139431
Scores for add_bedrooms_per_room = TRUE, add_rooms_per_household = TRUE: [63662. 69869. 68162. 68362. 66650. 66548. 69559. 73129. 68120. 64996.]
Mean for add_bedrooms_per_room = TRUE, add_rooms_per_household = TRUE: 67905.7
Standard deviation for add_bedrooms_per_room = TRUE, add_rooms_per_household = TRUE: 2531.523456340075
