In [53]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import Imputer


In [59]:
train_data_all = pd.read_csv("housing_data_TRAIN.csv")
train_data=train_data_all.select_dtypes(exclude=['object'])


In [60]:
#X_train.info()
cols_with_null = [col for col in train_data.isnull().any().index if train_data.isnull().any()[col]]


In [75]:
train_imputed = train_data.copy()
train_dropnulls = train_data.drop(cols_with_null, axis=1)
my_imputer = Imputer()
train_imputed = pd.DataFrame(my_imputer.fit_transform(train_imputed),index=train_data.index, columns=train_data.columns)

train_imputed_with_missing_column = train_data.copy()
for col in cols_with_null:
    train_imputed_with_missing_column[col + "_was_missing"] = train_imputed_with_missing_column[col].isnull()
    
train_imputed_with_missing_column = pd.DataFrame(my_imputer.fit_transform(train_imputed_with_missing_column), \
                                                            index=train_imputed_with_missing_column.index, columns=train_imputed_with_missing_column.columns)


In [87]:
data_to_use = train_dropnulls
X_train, X_test, y_train, y_test = train_test_split(data_to_use.drop("SalePrice", axis=1), data_to_use['SalePrice'], test_size=0.33, random_state=42)
rf_regressor = RandomForestRegressor()
# Fit with dropnulls
rf_regressor.fit(X_train, y_train)
predicted = rf_regressor.predict(X_test)
mae_dropnulls = mean_absolute_error(y_test, predicted)
mae_dropnulls

19395.922614107887

In [88]:
data_to_use = train_imputed
X_train, X_test, y_train, y_test = train_test_split(data_to_use.drop("SalePrice", axis=1), data_to_use['SalePrice'], test_size=0.33, random_state=42)
rf_regressor = RandomForestRegressor()
# Fit with dropnulls
rf_regressor.fit(X_train, y_train)
predicted = rf_regressor.predict(X_test)
mae_imputed = mean_absolute_error(y_test, predicted)
mae_imputed

18751.551452282158

In [89]:
data_to_use = train_imputed_with_missing_column
X_train, X_test, y_train, y_test = train_test_split(data_to_use.drop("SalePrice", axis=1), data_to_use['SalePrice'], test_size=0.33, random_state=42)
rf_regressor = RandomForestRegressor()
# Fit with dropnulls
rf_regressor.fit(X_train, y_train)
predicted = rf_regressor.predict(X_test)
mae_imputed_with_was_missing = mean_absolute_error(y_test, predicted)
mae_imputed_with_was_missing

18989.852904564315

In [90]:
print("MAE dropping columns with nulls: ", mae_dropnulls)
print("MAE imputing values for fields with nulls: ", mae_imputed)
print("MAE with imputed values and _was_null column: ", mae_imputed_with_was_missing)


MAE dropping columns with nulls:  19395.9226141
MAE imputing values for fields with nulls:  18751.5514523
MAE with imputed values and _was_null column:  18989.8529046


In [92]:
X_train.sample(10)


Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,LotFrontage_was_missing,MasVnrArea_was_missing,GarageYrBlt_was_missing
1329,1330.0,60.0,63.0,9084.0,7.0,5.0,1998.0,1998.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,6.0,2006.0,0.0,0.0,0.0
242,243.0,50.0,63.0,5000.0,5.0,4.0,1900.0,1950.0,0.0,0.0,...,77.0,0.0,0.0,0.0,0.0,4.0,2006.0,0.0,0.0,0.0
1313,1314.0,60.0,108.0,14774.0,9.0,5.0,1999.0,1999.0,165.0,0.0,...,0.0,0.0,0.0,0.0,0.0,5.0,2010.0,0.0,0.0,0.0
805,806.0,20.0,91.0,12274.0,7.0,5.0,2008.0,2008.0,256.0,0.0,...,0.0,0.0,0.0,0.0,0.0,7.0,2008.0,0.0,0.0,0.0
580,581.0,20.0,70.049958,14585.0,6.0,6.0,1960.0,1987.0,85.0,594.0,...,0.0,0.0,0.0,0.0,0.0,6.0,2007.0,1.0,0.0,0.0
974,975.0,70.0,60.0,11414.0,7.0,8.0,1910.0,1993.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,10.0,2009.0,0.0,0.0,0.0
1211,1212.0,50.0,152.0,12134.0,8.0,7.0,1988.0,2005.0,0.0,427.0,...,0.0,0.0,0.0,0.0,0.0,6.0,2010.0,0.0,0.0,0.0
1202,1203.0,50.0,50.0,6000.0,5.0,8.0,1925.0,1997.0,0.0,0.0,...,208.0,0.0,0.0,0.0,0.0,5.0,2009.0,0.0,0.0,0.0
1135,1136.0,30.0,60.0,6180.0,6.0,5.0,1926.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,0.0,0.0,0.0
1227,1228.0,20.0,72.0,8872.0,5.0,8.0,1965.0,2008.0,300.0,595.0,...,0.0,0.0,0.0,0.0,0.0,12.0,2008.0,0.0,0.0,0.0


In [95]:
np.asarray([1,"a",3.0])

array(['1', 'a', '3.0'],
      dtype='<U11')

In [None]:
np.