In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import os

In [2]:
# Relative directory and file name of the housing CSV.
dataPath = "datasets/housing"
dataFileName = "housing.csv"
def loadData(dataPath,dataFileName):
    """Read a CSV file into a DataFrame.

    Parameters
    ----------
    dataPath : str
        Directory that contains the file.
    dataFileName : str
        Name of the CSV file inside `dataPath`.

    Returns
    -------
    pandas.DataFrame
        The result of `pd.read_csv` on the joined path.
    """
    csvLocation = os.path.join(dataPath, dataFileName)
    frame = pd.read_csv(csvLocation)
    return frame

# Load the raw housing table from disk.
housing = loadData(dataPath,dataFileName)

# Bucket median income into categories: ceil(income / 1.5), with every
# category of 5 or above merged into category 5.
housing['income_cat'] = np.ceil(housing['median_income']/1.5)
# Assign the capped Series back explicitly. Calling
# `.where(..., inplace=True)` on `housing['income_cat']` is a chained in-place
# update that is not guaranteed to write through to the DataFrame (it does not
# under pandas Copy-on-Write), which would silently leave the cap unapplied.
housing['income_cat'] = housing['income_cat'].where(housing['income_cat']<5,5)

# Separate the prediction target from the predictors.
housing_labels = housing["median_house_value"].copy()
housing = housing.drop("median_house_value", axis=1)

In [3]:
from sklearn.model_selection import StratifiedShuffleSplit

def train_test_stratSplit(data,labels,stratifyColumn='income_cat',trainSize=0.8,randomState=42):
    """Split predictors and labels into stratified train and test parts.

    Parameters
    ----------
    data : pandas.DataFrame
        Predictor table; must contain the column named by `stratifyColumn`.
    labels : pandas.Series
        Target values, row-aligned with `data`.
    stratifyColumn : str, default 'income_cat'
        Column of `data` whose class proportions are preserved in both parts.
    trainSize : float, default 0.8
        Value passed to `StratifiedShuffleSplit` as `train_size`.
    randomState : int, default 42
        Seed passed to `StratifiedShuffleSplit` as `random_state`.

    Returns
    -------
    tuple
        `(trainDataX, trainDataY, testDataX, testDataY)`.
    """
    split = StratifiedShuffleSplit(n_splits = 1,train_size = trainSize,random_state = randomState)
    # Only one split is requested, so take it directly instead of looping.
    trainIdx,testIdx = next(split.split(data,data[stratifyColumn]))
    # The splitter yields positional row numbers, so select with .iloc.
    # Label-based .loc is only equivalent when the index is the default 0..n-1
    # range and would pick wrong rows (or raise) after any filter or reindex.
    trainDataX = data.iloc[trainIdx]
    testDataX = data.iloc[testIdx]
    trainDataY = labels.iloc[trainIdx]
    testDataY = labels.iloc[testIdx]
    return (trainDataX,trainDataY,testDataX,testDataY)

# Stratified split of the predictors and their labels into train and test parts.
(trainDataX,
 trainDataY,
 testDataX,
 testDataY) = train_test_stratSplit(data=housing, labels=housing_labels)

In [4]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import TransformerMixin,BaseEstimator

In [5]:
# Positional column indices that AttributesAdder.transform uses to slice its
# 2-D input. Presumably these are the total_rooms, total_bedrooms, population
# and households columns of the numeric feature array; verify against the
# column order produced by the selector step.
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
household_ix = 6

class AttributesAdder(TransformerMixin, BaseEstimator):
    """Stateless transformer that appends three per-household ratio columns.

    Inherits from `TransformerMixin` and `BaseEstimator` so the step exposes
    `fit_transform`, `get_params` and `set_params` and can be used as a
    regular scikit-learn pipeline step.

    The input is indexed as `data[:, i]`, so it must be a 2-D array; the
    columns are located through the module-level `rooms_ix`, `bedrooms_ix`,
    `population_ix` and `household_ix` indices.
    """
    def __init__(self):
        pass
    def fit(self,data,y=None):
        """Learn nothing; return `self` so calls can be chained."""
        return self
    def transform(self,data,y=None):
        """Return `data` with three extra columns appended on the right.

        The appended columns are, in order: rooms / households,
        population / households and bedrooms / households.
        """
        householdCounts = data[:,household_ix]
        Rooms_Per_Household = data[:,rooms_ix]/householdCounts
        Population_Per_Household = data[:,population_ix]/householdCounts
        Bedrooms_Per_Household = data[:,bedrooms_ix]/householdCounts
        return np.c_[data, Rooms_Per_Household, Population_Per_Household,Bedrooms_Per_Household]
    
class DataFrameSelector(TransformerMixin, BaseEstimator):
    """Stateless transformer that picks columns out of a DataFrame.

    Inherits from `TransformerMixin` and `BaseEstimator` so the step exposes
    `fit_transform`, `get_params` and `set_params` and can be used as a
    regular scikit-learn pipeline step.

    Parameters
    ----------
    AttributeNames : list
        Column labels to select, in the order they should appear in the output.
    """
    def __init__(self,AttributeNames):
        self.AttributeNames = AttributeNames
    def fit(self,data,y=None):
        """Learn nothing; return `self` so calls can be chained."""
        return self
    def transform(self,data,y=None):
        """Return the selected columns of `data` as `data[...].values`."""
        return data[self.AttributeNames].values
        


In [6]:
# The single text column goes through the categorical pipeline; every other
# column of the training frame is treated as numeric.
# NOTE(review): `income_cat` is still a column of the frame at this point, so
# it is fed to the model as a numeric feature. Confirm that is intended.
categoricalAttribute = ["ocean_proximity"]
numericalAttribute = list(trainDataX.columns)
numericalAttribute.remove(categoricalAttribute[0])

# Numeric branch: select columns -> fill missing values with the median ->
# append the ratio columns -> standardise.
numerical_pipeline = Pipeline([('selector',DataFrameSelector(numericalAttribute)),
                                ('imputer',SimpleImputer(strategy = "median")),
                                ('Attribute_Adder',AttributesAdder()),
                                ('std_scaler',StandardScaler()),
                              ])

# The dense-output switch of OneHotEncoder was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2, and the old name was removed in 1.4.
# Try the current name first and fall back so the cell runs on both.
try:
    denseOneHotEncoder = OneHotEncoder(sparse_output=False)
except TypeError:
    denseOneHotEncoder = OneHotEncoder(sparse=False)

# Categorical branch: select columns -> dense one-hot encoding.
categorical_pipeline = Pipeline([('selector',DataFrameSelector(categoricalAttribute)),
                                 ('one_hot_encoder', denseOneHotEncoder),
                                ])

# Run both branches and concatenate their outputs column-wise.
full_pipeline = FeatureUnion(transformer_list=[("numerical_pipeline",numerical_pipeline),
                                               ("categorical_pipeline",categorical_pipeline),
                                              ])

In [7]:
# Fit the preprocessing (imputer medians, scaler statistics, one-hot
# categories) on the training set only, then apply that fitted pipeline to
# the test set. Re-fitting on the test set would scale it with its own
# statistics and could produce a different set of one-hot columns.
# NOTE: both names are rebound from DataFrames to transformed arrays, so this
# cell cannot be re-run without first re-running the split cell.
trainDataX = full_pipeline.fit_transform(trainDataX)
testDataX = full_pipeline.transform(testDataX)

In [8]:
# Candidate regressors. The tree-based models are randomised, so they are
# seeded (with the same seed as the train/test split) to make the
# cross-validation scores reproducible across runs.
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()

from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor(random_state=42)

from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor(random_state=42)

In [19]:
from sklearn.model_selection import cross_val_score

def docrossValidation(model,Predictors,Response):
    """Print the 10-fold cross-validated RMSE of `model`.

    Runs `cross_val_score` with `scoring="neg_mean_squared_error"` and
    `cv=10`, converts the negated MSE of each fold to RMSE, and prints the
    per-fold values followed by their mean and standard deviation.
    Returns nothing.
    """
    negMse = cross_val_score(model, Predictors, Response,
                             scoring="neg_mean_squared_error", cv=10)
    # Scores are negated MSE values; flip the sign before taking the root.
    rmse_score = np.sqrt(-negMse)
    report = (("Scores:", rmse_score),
              ("Mean:", rmse_score.mean()),
              ("Standard deviation:", rmse_score.std()))
    for caption, value in report:
        print(caption, value)

In [20]:
# Cross-validate the linear regression on the prepared training data.
docrossValidation(model=lin_reg, Predictors=trainDataX, Response=trainDataY)

Scores: [67616.06866576 66969.59255376 69440.54457916 74277.32547784
 68221.6922074  71588.81580869 65078.10578944 68282.65383268
 71976.63230477 68180.86367448]
Mean: 69163.22948939742
Standard deviation: 2579.738739298033


In [21]:
# Cross-validate the decision tree on the prepared training data.
docrossValidation(model=tree_reg, Predictors=trainDataX, Response=trainDataY)

Scores: [70359.2510772  67884.58341669 69727.33172417 73357.70736495
 68769.14278451 75991.74189439 72373.15555997 71293.80608859
 77655.87586433 67608.90553028]
Mean: 71502.15013050966
Standard deviation: 3200.6186149320074


In [18]:
# Cross-validate the random forest on the prepared training data.
# NOTE(review): the recorded output of this cell starts with an
# "rmse_scores:" line that docrossValidation does not print, and the cell's
# execution count is lower than the one of the cell defining the function.
# The output looks stale; re-run with Restart & Run All.
docrossValidation(model=forest_reg, Predictors=trainDataX, Response=trainDataY)

rmse_scores: [49561.2004537  47274.7253045  49981.09437336 52530.05363138
 49594.37547891 53653.28455076 49230.32160164 47486.73816085
 53361.50395595 50525.73204233]
Scores: [49561.2004537  47274.7253045  49981.09437336 52530.05363138
 49594.37547891 53653.28455076 49230.32160164 47486.73816085
 53361.50395595 50525.73204233]
Mean: 50319.90295533822
Standard deviation: 2121.17489132927
