# House Prices - Processing the data

Based on the work with the EDA and Feature refinement notebooks, start to create a clean dataset that can be used for training a model.

In [1]:
import os
import pandas as pd
import numpy as np

PROJECT_PATH = "/".join(os.getcwd().split('/')[:-1])
DATA_PATH = os.path.join(PROJECT_PATH, 'data')
TRAIN_DATA_PATH = os.path.join(DATA_PATH, 'train.csv')

# Load the training dataset
house_prices_train = pd.read_csv(TRAIN_DATA_PATH)
house_prices_train = house_prices_train.drop('Id', axis=1)
house_prices_train.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


## Defining the columns

Getting the columns that were considered the best ones to use from the EDA notebook

In [2]:
numeric_features = [
    'LotFrontage',
    'LotArea', 'YearBuilt',
    'MasVnrArea',
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
    '1stFlrSF',
    '2ndFlrSF', 
    'GrLivArea', 
    'GarageYrBlt', 
    'GarageArea', 
    'WoodDeckSF', 
    'OpenPorchSF', 
    'EnclosedPorch', 
    'ScreenPorch', 
]

ordinal_features = [
    'GarageCond',
    'GarageQual',
    'FireplaceQu',
    'KitchenQual',
    'HeatingQC',
    'BsmtFinType2',
    'BsmtFinType1',
    'BsmtExposure',
    'BsmtCond',
    'BsmtQual',
    'ExterCond',
    'ExterQual'
]

categorical_features = [
    'MSZoning',
    'Street',
    'LotShape',
    'LandContour',
    'Utilities',
    'LotConfig',
    'LandSlope',
    'Neighborhood',
    'Condition1',
    'Condition2',
    'BldgType',
    'HouseStyle', 
    'RoofStyle',
    'RoofMatl',
    'Exterior1st',
    'Exterior2nd',
    'MasVnrType',
    'Foundation',
    'Heating',
    'CentralAir',
    'Electrical',
    'Functional',
    'GarageType',
    'GarageFinish',
    'PavedDrive',
    'SaleType',
    'SaleCondition'
]

Bringing in the processing pipeline class from the EDA notebook

In [3]:
from sklearn.base import TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline, FeatureUnion

class FeatureExtractor(TransformerMixin):
    def __init__(self, cols):
        self.cols = cols
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X[self.cols]

class HouseDataProcessor(object):
    def __init__(self, numeric_cols, ordinal_cols, categorical_cols):
        self.numeric_cols = numeric_cols
        self.ordinal_cols = ordinal_cols
        self.categorical_cols = categorical_cols
        
        self.numeric_pipeline_v1 = Pipeline([
            ('extractor', FeatureExtractor(self.numeric_cols)),
            ('impute', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ])

        self.ordinal_pipeline_v1 = Pipeline([
            ('extractor', FeatureExtractor(self.ordinal_cols)),
            ('impute', SimpleImputer(strategy='most_frequent')),
            ('ordinal', OrdinalEncoder())
        ])

        self.categorical_pipeline_v1 = Pipeline([
            ('extractor', FeatureExtractor(self.categorical_cols)),
            ('impute', SimpleImputer(strategy='most_frequent')),
            ('one_hot', OneHotEncoder())
        ])

        self.processing_pipeline_v1 = FeatureUnion([
            ('numeric', self.numeric_pipeline_v1),
            ('ordinal', self.ordinal_pipeline_v1),
            ('categorical', self.categorical_pipeline_v1)
        ])
        
    def fit(self, X):
        return self.processing_pipeline_v1.fit(X)
    
    def transform(self, X):
        return self.processing_pipeline_v1.transform(X)
    
    def fit_transform(self, X):
        return self.processing_pipeline_v1.fit_transform(X)

## Remove Outlier SalesPrice

Remove the extreme values from the Sale Price so it doesn't skrew the results when making a prediction

In [4]:
sale_price_data = house_prices_train['SalePrice']

q1 = sale_price_data.quantile(0.25)
q3 = sale_price_data.quantile(0.75)
iqr = (q3 - q1)

# Extract the outliers sale price from the data
outliers = house_prices_train[(sale_price_data < (q1 - 1.5 * iqr)) | (sale_price_data > (q3 + 1.5 * iqr))]
outliers.shape

(61, 80)

In [5]:
# Get only the indexes that don't appear in the outlier list
inlier_index = ~house_prices_train.index.isin(outliers.index)

# Remove the outliers from the data 
# So we are left with data that doesn't have extreme outliers
training_data = house_prices_train[inlier_index]
training_data.shape

(1399, 80)

In [6]:
TARGET_FEATURE = 'SalePrice'

# Split into X and y
X = training_data.drop(TARGET_FEATURE, axis=1)
y = training_data[TARGET_FEATURE]

## Creating a training, validation & testing dataset

In [7]:
from sklearn.model_selection import train_test_split

# Set out initial random state
# This will be used by anything that can have its random stat defined
RANDOM_STATE = 11

# Sizes of each dataset
train_size = 0.7
val_size = 0.2
test_size = 0.1

# Creating Train and Test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE, test_size=test_size)

# From the training set, split into trianing and validation
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=RANDOM_STATE, test_size=val_size)

print("Train Size: {:6}".format(X_train.shape[0]))
print("Val Size: {:7}".format(X_val.shape[0]))
print("Test Size: {:6}".format(X_test.shape[0]))

Train Size:   1007
Val Size:     252
Test Size:    140


In [8]:
# TODO - Run the data processing pipeline 
processor = HouseDataProcessor(numeric_features, ordinal_features, categorical_features)
X_train_processed = processor.fit_transform(X_train)

## Modelling

In [12]:
from sklearn.ensemble import RandomForestRegressor

# Train a random forest regressor
rand_clf_v1 = RandomForestRegressor(random_state=RANDOM_STATE)
rand_clf_v1.fit(X_train_processed, y_train)

RandomForestRegressor(random_state=11)

In [15]:
from sklearn.metrics import mean_squared_error

# Seeing a baseline with a basic random forest
y_pred = rand_clf_v1.predict(X_train_processed)

mse = mean_squared_error(y_train, y_pred)
rmse = np.sqrt(mse)
print('Random Forest V1 RMSE: ', rmse)

Random Forest V1 RMSE:  8610.330891854


For  baseline score, being around \$8000 in terms of error isn't to bad considered we are dealing with houses that are in the hundreds of thousands. I will now try to improve this result.