In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import os

In [2]:
dataPath = "datasets/housing"
dataFileName = "housing.csv"


def loadData(dataPath, dataFileName):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    dataPath : str
        Directory that contains the file.
    dataFileName : str
        Name of the CSV file inside ``dataPath``.

    Returns
    -------
    pandas.DataFrame
        The result of ``pd.read_csv`` on the joined path.
    """
    csv_location = os.path.join(dataPath, dataFileName)
    frame = pd.read_csv(csv_location)
    return frame

housing = loadData(dataPath,dataFileName)

# Income category = ceil(median_income / 1.5), with every value that is not
# below 5 replaced by 5. The column is used further down as the
# stratification key for the train/test split.
housing['income_cat'] = np.ceil(housing['median_income']/1.5)
# Assign the capped Series back to the column. Calling
# `.where(..., inplace=True)` on `housing['income_cat']` is a chained in-place
# operation: with pandas copy-on-write it modifies a temporary and leaves
# `housing` unchanged, so the cap would silently not be applied.
housing['income_cat'] = housing['income_cat'].where(housing['income_cat'] < 5, 5)

# Separate the regression target from the predictors.
housing_labels = housing["median_house_value"].copy()
housing = housing.drop("median_house_value", axis=1)

In [3]:
from sklearn.model_selection import StratifiedShuffleSplit

def train_test_stratSplit(data,labels):
    """Split features and labels into stratified train and test parts.

    Stratification is done on ``data['income_cat']`` with a single
    ``StratifiedShuffleSplit`` (``train_size=0.8``, ``random_state=42``).

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table; must contain an ``income_cat`` column.
    labels : pandas.Series
        Targets, row-aligned with ``data``.

    Returns
    -------
    tuple
        ``(trainDataX, trainDataY, testDataX, testDataY)``.
    """
    split = StratifiedShuffleSplit(n_splits = 1,train_size = 0.8,random_state = 42)
    # n_splits is 1, so take the single (train, test) pair directly instead of
    # relying on variables that leak out of a for-loop body.
    trainIdx, testIdx = next(split.split(data, data['income_cat']))
    # The splitter yields *positional* indices, so select rows with .iloc.
    # Label-based .loc only gives the same rows when the index happens to be
    # the default 0..n-1 range, and picks wrong/missing rows otherwise.
    trainDataX = data.iloc[trainIdx]
    testDataX = data.iloc[testIdx]
    trainDataY = labels.iloc[trainIdx]
    testDataY = labels.iloc[testIdx]
    return (trainDataX,trainDataY,testDataX,testDataY)

# Stratified split of the predictors and labels; train_test_stratSplit keeps 80% for training.
trainDataX,trainDataY,testDataX,testDataY = train_test_stratSplit(housing,housing_labels)

In [4]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import TransformerMixin,BaseEstimator

In [5]:
# Column positions read by AttributesAdder.transform from its 2-D input array.
# NOTE(review): these are assumed to match the order of the numeric columns
# selected upstream -- re-check them if that column list ever changes.
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6


class AttributesAdder(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends three per-household ratio columns.

    Inheriting from ``BaseEstimator`` and ``TransformerMixin`` supplies
    ``get_params``/``set_params`` and ``fit_transform``, which scikit-learn
    expects from every pipeline step.
    """

    def fit(self, data, y=None):
        """Nothing is learned from ``data``; return ``self`` unchanged."""
        return self

    def transform(self, data, y=None):
        """Return ``data`` with three extra columns appended on the right.

        Parameters
        ----------
        data : array indexable as ``data[:, i]``
            Input matrix containing the columns at ``rooms_ix``,
            ``bedrooms_ix``, ``population_ix`` and ``household_ix``.
        y : ignored

        Returns
        -------
        numpy.ndarray
            ``data`` followed by rooms/household, population/household and
            bedrooms/household, in that order.
        """
        households = data[:, household_ix]
        rooms_per_household = data[:, rooms_ix] / households
        population_per_household = data[:, population_ix] / households
        bedrooms_per_household = data[:, bedrooms_ix] / households
        return np.c_[data, rooms_per_household, population_per_household, bedrooms_per_household]
    
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Select a fixed set of columns and return them as a plain array.

    Inheriting from ``BaseEstimator`` and ``TransformerMixin`` supplies
    ``get_params``/``set_params`` and ``fit_transform``, which scikit-learn
    expects from every pipeline step.

    Parameters
    ----------
    AttributeNames : list
        Column labels to keep; stored unchanged on the instance.
    """

    def __init__(self, AttributeNames):
        self.AttributeNames = AttributeNames

    def fit(self, data, y=None):
        """Nothing is learned from ``data``; return ``self`` unchanged."""
        return self

    def transform(self, data, y=None):
        """Return ``data[self.AttributeNames].values``."""
        return data[self.AttributeNames].values
        


In [6]:
# Every column of the training frame except the text column goes through the
# numeric branch; the text column is one-hot encoded separately.
numericalAttribute = list(trainDataX)
numericalAttribute.remove("ocean_proximity")
categoricalAttribute = ["ocean_proximity"]

# Numeric branch: select columns -> fill gaps with the median -> append the
# ratio features -> standardise.
numerical_pipeline = Pipeline([('selector',DataFrameSelector(numericalAttribute)),
                                ('imputer',SimpleImputer(strategy = "median")),
                                ('Attribute_Adder',AttributesAdder()),
                                ('std_scaler',StandardScaler()),
                              ])

# OneHotEncoder's `sparse` argument was renamed to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, where it raises
# TypeError. Try the current spelling first and fall back for older
# installations, so the encoder returns a dense array either way.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False)

categorical_pipeline = Pipeline([('selector',DataFrameSelector(categoricalAttribute)),
                                 ('one_hot_encoder', one_hot_encoder),
                                ])

# Run both branches and concatenate their outputs column-wise.
full_pipeline = FeatureUnion(transformer_list=[("numerical_pipeline",numerical_pipeline),
                                               ("categorical_pipeline",categorical_pipeline),
                                              ])

In [7]:
# Learn the preprocessing (imputer medians, scaler statistics, one-hot
# categories) from the training set only.
trainDataX = full_pipeline.fit_transform(trainDataX)
# Apply the already-fitted pipeline to the test set. Calling fit_transform
# here would re-estimate every statistic on the test data, so the two sets
# would be scaled and encoded inconsistently and the test set would no longer
# be a fair hold-out.
testDataX = full_pipeline.transform(testDataX)
# NOTE: both names are rebound from DataFrames to arrays, so this cell cannot
# be re-run on its own; re-run the split cell first.

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Candidate models. The tree-based estimators are randomised, so they are
# seeded (same seed as the train/test split) to make the cross-validation
# numbers reproducible from one run to the next.
lin_reg = LinearRegression()

tree_reg = DecisionTreeRegressor(random_state=42)

forest_reg = RandomForestRegressor(random_state=42)

In [59]:
from sklearn.model_selection import cross_val_score

def docrossValidation(model,Predictors,Response):
    """Print 10-fold cross-validated RMSE statistics for ``model``.

    Parameters
    ----------
    model : estimator
        Passed to ``cross_val_score`` as the estimator to evaluate.
    Predictors : array-like
        Feature matrix passed to ``cross_val_score``.
    Response : array-like
        Target values passed to ``cross_val_score``.

    Returns
    -------
    None
        The per-fold RMSE values, their mean and their standard deviation
        are printed, not returned.
    """
    scores = cross_val_score(model, Predictors , Response,scoring="neg_mean_squared_error", cv=10)
    # The scorer reports *negated* MSE, so flip the sign before the square root.
    rmse_score = np.sqrt(-scores)
    # Limit the 2-decimal formatting to this one print. np.set_printoptions
    # would permanently change how every later array in the session is shown.
    with np.printoptions(precision=2):
        print("Scores:",rmse_score)
    print("Mean: {0:<12.4f}".format(rmse_score.mean()))
    print("Standard deviation: {0:<12.4f}".format(rmse_score.std()))

In [60]:
# Linear regression: 10-fold cross-validated RMSE on the prepared training set.
docrossValidation(lin_reg,trainDataX,trainDataY)

Scores: [67616.07 66969.59 69440.54 74277.33 68221.69 71588.82 65078.11 68282.65
 71976.63 68180.86]
Mean: 69163.2295  
Standard deviation: 2579.7387   


In [61]:
# Decision tree: 10-fold cross-validated RMSE on the prepared training set.
docrossValidation(tree_reg,trainDataX,trainDataY)

Scores: [70470.23 69276.53 70346.06 72612.01 68426.37 75170.97 73178.29 70085.78
 76856.56 67916.82]
Mean: 71433.9619  
Standard deviation: 2789.2775   


In [62]:
# Random forest: 10-fold cross-validated RMSE on the prepared training set.
docrossValidation(forest_reg,trainDataX,trainDataY)

Scores: [49746.07 47685.14 49653.81 52707.03 49646.09 53630.24 49124.66 47933.03
 53390.4  51000.36]
Mean: 50451.6814  
Standard deviation: 2041.2927   
