In [1]:
import pandas as pd
import sklearn

# Show all columns when displaying .head() or .describe().
# The fully qualified key is used because the bare 'max_columns' pattern can
# match more than one pandas option (e.g. a Styler option), which makes
# set_option raise an OptionError.
pd.set_option('display.max_columns', None)

In [2]:
# Load the training and test sets, then separate the target from the features.
# Training rows without a SalePrice carry no label to learn from, so they are
# discarded right after loading.
X = pd.read_csv("Kaggle_HousePrice_TrainingData.csv").dropna(axis = 0, subset = ['SalePrice'])
X_test = pd.read_csv("Kaggle_HousePrice_TestData.csv")

# y is kept as a single-column DataFrame holding the target; X keeps everything else.
y = X[['SalePrice']]
X = X.drop(columns = ['SalePrice'])

In [3]:
# Drop sparse features, then choose the columns the model will use.
#
# A feature is dropped when more than MAX_MISSING_FRACTION of the training rows
# are missing for it (i.e. less than 70% of its data is available), since
# imputing such a column would likely lead to very inaccurate results.
# The threshold is a fraction rather than a fixed row count so the rule stays
# correct whatever the number of training rows is.
MAX_MISSING_FRACTION = 0.30

# isnull().mean() gives, per column, the share of training rows that are missing.
missing_fraction = X.isnull().mean()
sparse_columns = missing_fraction[missing_fraction > MAX_MISSING_FRACTION].index.tolist()

# Remove the same columns from both frames so they keep an identical feature set.
# Non-mutating drops avoid modifying X while it is being inspected.
X = X.drop(columns = sparse_columns)
X_test = X_test.drop(columns = sparse_columns)

# Columns with numerical data (every non-object dtype).
# NOTE(review): this keeps any numeric identifier column (e.g. an 'Id' column,
# if present) as a model feature - confirm that is intended.
numerical = X.select_dtypes(exclude = 'object').columns.tolist()

# Categorical columns with a cardinality of less than 10, so One Hot Encoding
# doesn't result in too many extra columns. Higher-cardinality object columns
# are left out of the feature set entirely.
categorical = [col for col in X.columns if X[col].dtype == 'object' and X[col].nunique() < 10]

X = X[categorical + numerical]
X_test = X_test[categorical + numerical]

The columns with missing values prior to data preprocessing, and the number of missing values in each, are:

['LotFrontage 259', 'Alley 1369', 'MasVnrType 8', 'MasVnrArea 8', 'BsmtQual 37', 'BsmtCond 37', 'BsmtExposure 38', 'BsmtFinType1 37', 'BsmtFinType2 38', 'Electrical 1', 'FireplaceQu 690', 'GarageType 81', 'GarageYrBlt 81', 'GarageFinish 81', 'GarageQual 81', 'GarageCond 81', 'PoolQC 1453', 'Fence 1179', 'MiscFeature 1406']

**If more than 30% of the data for a feature is missing, drop that column, as imputation would most likely lead to inaccurate results (Alley, FireplaceQu, PoolQC, Fence, and MiscFeature are dropped from the training and test data sets).**

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the labelled rows for validation and train on the other 85%.
# random_state is fixed so the same split is produced on every run.
split_parts = train_test_split(X, y, train_size = .85, test_size = .15, random_state = 1)
X_train, X_valid, y_train, y_valid = split_parts

Use sklearn's Pipeline to preprocess the data concisely.

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder


# Numerical columns: fill each gap with the mean of the column's non-missing values.
num_imputer = SimpleImputer(strategy = 'mean')

# Categorical columns: fill each gap with the column's most frequent value, then
# one-hot encode. handle_unknown = 'ignore' keeps the encoder from throwing an
# exception when it meets a category it did not see while fitting.
categorical_steps = [
    ('impute', SimpleImputer(strategy = 'most_frequent')),
    ('encode', OneHotEncoder(handle_unknown = 'ignore')),
]
cat_impute = Pipeline(steps = categorical_steps)

# Route each group of columns to its own transformer.
column_transformers = [
    ('numeric', num_imputer, numerical),
    ('categorical', cat_impute, categorical),
]
preprocess = ColumnTransformer(transformers = column_transformers)

In [6]:
from sklearn.ensemble import RandomForestRegressor

# Random forest regressor made of 80 decision trees; the seed is fixed so that
# repeated runs build the same forest.
model = RandomForestRegressor(
    n_estimators = 80,
    random_state = 1,
)

In [7]:
from sklearn.metrics import mean_absolute_error

# Finalizes the pipeline: the preprocessor cleans the data before it is fed
# into the model, so fit and predict both apply the same transformations.
final_model = Pipeline (steps = [
    ('preprocessor', preprocess),
    ('model', model)
])

# y_train is a single-column DataFrame; flatten it to the 1-D array of targets.
final_model.fit(X_train, y_train.values.ravel())
predictions = final_model.predict(X_valid)

# mean_absolute_error is documented as (y_true, y_pred); passing the arguments
# in that order keeps the call correct if the metric is later swapped for one
# that is not symmetric.
mae = mean_absolute_error(y_valid, predictions)
print ("Model's mean absolute error:", mae)

Model's mean absolute error: 15494.560273972604


In [8]:
# Submission to Kaggle Competition https://www.kaggle.com/c/home-data-for-ml-course/overview
# (left disabled: uncomment the lines below to predict on X_test and write submission.csv)
# preds_test = final_model.predict(X_test)

# output = pd.DataFrame({'Id': X_test.Id,
#                        'SalePrice': preds_test})
# output.to_csv('submission.csv', index = False)