In [1]:
import pandas as pd
import numpy as np

In [2]:
melbourne_file_path = 'melb_data.csv'
melbourne_data = pd.read_csv(melbourne_file_path) 

In [4]:
# Count how many missing numbers there are in each column
missing_val_count_by_column = (melbourne_data.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])

Car               62
BuildingArea    6450
YearBuilt       5375
CouncilArea     1369
dtype: int64


In [5]:
# In previous file we simply dropped the rows with missing values.

In [6]:
# This was not a very good solution as we lose data. 
# Can instead us imputation, where missing data is filled in with the mean of that column. Or extended imputation.
# Compare these approaches here.
melbourne_data = pd.read_csv(melbourne_file_path) 

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
from sklearn.tree import DecisionTreeRegressor

In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [10]:
melb_target = melbourne_data.Price
melb_predictors = melbourne_data.drop(['Price'], axis=1)

In [11]:
# For the sake of keeping it simple, we'll use only numeric predictors. 
melb_numeric_predictors = melb_predictors.select_dtypes(exclude=['object'])

In [12]:
X_train, X_test, y_train, y_test = train_test_split(melb_numeric_predictors, 
                                                    melb_target,
                                                    train_size=0.7, 
                                                    test_size=0.3, 
                                                    random_state=0)

In [13]:
def score_dataset(X_train, X_test, y_train, y_test):
    model = RandomForestRegressor()
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return mean_absolute_error(y_test, preds)

In [16]:
# 1) Get model Score from dropping columns with missing values

In [17]:
cols_with_missing = [col for col in X_train.columns 
                                 if X_train[col].isnull().any()]
reduced_X_train = X_train.drop(cols_with_missing, axis=1)
reduced_X_test  = X_test.drop(cols_with_missing, axis=1)
print("Mean Absolute Error from dropping columns with Missing Values:")
print(score_dataset(reduced_X_train, reduced_X_test, y_train, y_test))

Mean Absolute Error from dropping columns with Missing Values:
189141.74432245875


In [18]:
# 2) Get Model Score from Imputation

In [19]:
from sklearn.preprocessing import Imputer

my_imputer = Imputer()
imputed_X_train = my_imputer.fit_transform(X_train)
imputed_X_test = my_imputer.transform(X_test)
print("Mean Absolute Error from Imputation:")
print(score_dataset(imputed_X_train, imputed_X_test, y_train, y_test))

Mean Absolute Error from Imputation:
184784.94668747223


In [20]:
# Get Score from Imputation with Extra Columns Showing What Was Imputed

In [21]:
imputed_X_train_plus = X_train.copy()
imputed_X_test_plus = X_test.copy()

cols_with_missing = (col for col in X_train.columns 
                                 if X_train[col].isnull().any())
for col in cols_with_missing:
    imputed_X_train_plus[col + '_was_missing'] = imputed_X_train_plus[col].isnull()
    imputed_X_test_plus[col + '_was_missing'] = imputed_X_test_plus[col].isnull()

# Imputation
my_imputer = Imputer()
imputed_X_train_plus = my_imputer.fit_transform(imputed_X_train_plus)
imputed_X_test_plus = my_imputer.transform(imputed_X_test_plus)

print("Mean Absolute Error from Imputation while Track What Was Imputed:")
print(score_dataset(imputed_X_train_plus, imputed_X_test_plus, y_train, y_test))

Mean Absolute Error from Imputation while Track What Was Imputed:
183277.4903429413


In [25]:
# Conclusion
# As is common, imputing missing values allowed us to improve our model compared to dropping those columns. 
# We got an additional boost by tracking what values had been imputed.