# Housing Prices

In [None]:
# Imports and load
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Locations of the training and test CSV files, relative to the notebook.
iowa_file_path = '../input/train.csv'
test_data_path = '../input/test.csv'

# Read both files into DataFrames (training set first, then the test set).
home_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (iowa_file_path, test_data_path)
)

# For a quick look at the raw data, inspect home_data.columns or
# home_data.describe(include='all') in a scratch cell.

In [None]:
# Keep only intuitive features (the order below fixes the column order of X and test_X)
features = [
    'MSSubClass', 'MSZoning', 'Neighborhood',
    'OverallQual', 'OverallCond',
    'YearBuilt', 'YearRemodAdd',
    'ExterQual', 'ExterCond',
    'BsmtQual', 'BsmtCond', 'TotalBsmtSF',
    '1stFlrSF', '2ndFlrSF',
    'FullBath', 'HalfBath',
    'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
    'TotRmsAbvGrd',
    'GarageArea', 'GarageQual', 'GarageCond',
]

# Training predictors: the selected columns, minus every row that has a missing value.
X = home_data[features].dropna()

# Target: SalePrice restricted to the rows that survived the dropna above.
y = home_data.SalePrice[home_data.index.isin(X.index)].copy()

# Test predictors: same columns; missing values are left in place here.
test_X = test_data[features].copy()

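Train and test data must have identical columns after one-hot encoding. One way is to join the two sets, call `get_dummies()` on the combined data, and then split it again; the cell below instead encodes each set separately and keeps only the columns they share, using `align(join='inner', axis=1)`.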

src: https://stackoverflow.com/questions/44026832/valueerror-number-of-features-of-the-model-must-match-the-input

In [None]:
# One-hot encode the training and test predictors independently.
one_hot_encoded_training_predictors, one_hot_encoded_test_predictors = (
    pd.get_dummies(frame) for frame in (X, test_X)
)

# Align on columns with an inner join so both frames end up with exactly the
# same set of columns (dummy columns present in only one frame are dropped).
X, test_X = one_hot_encoded_training_predictors.align(
    one_hot_encoded_test_predictors,
    join='inner',
    axis=1,
)

In [None]:
# Find missing values
# Training predictors: count NaNs per column and show only columns that have any.
# Rows with missing values were dropped from X earlier, so this should print an
# empty Series.
missing_val_count_by_column = X.isnull().sum()
print(missing_val_count_by_column[missing_val_count_by_column > 0])

# Test predictors: the original cell repeated the check on X here; the test
# frame is the one that was never passed through dropna, so inspect it instead.
test_missing_val_count_by_column = test_X.isnull().sum()
print(test_missing_val_count_by_column[test_missing_val_count_by_column > 0])

In [None]:
# Find best features using a correlation matrix
correlation_dataframe = X.copy()

# Pairwise correlation between all (one-hot encoded) predictor columns.
corr = correlation_dataframe.corr()

# Render the matrix as a heat map with two decimals per cell. The Styler must be
# the last expression of the cell to be displayed, so it is built exactly once.
# `.format("{:.2f}")` replaces `Styler.set_precision(2)`, which was deprecated in
# pandas 1.3 and removed in pandas 2.0.
corr.style.background_gradient().format("{:.2f}")

In [None]:
# Split into validation and training data (fixed seed so the split is reproducible)
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=42)

# Define the model. random_state is fixed to 42 so repeated runs give the same forest.
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(train_X, train_y)

# Score the model on the held-out validation rows.
rf_val_predictions = rf_model.predict(val_X)
# mean_absolute_error is documented as (y_true, y_pred); pass the arguments in that order.
rf_val_mae = mean_absolute_error(val_y, rf_val_predictions)

print("Validation MAE for Random Forest Model: {:,.0f}".format(rf_val_mae))

In [None]:
# To improve accuracy, train a second Random Forest on all of the training data
# (X, y) instead of only the training part of the validation split.
# `fit` returns the fitted estimator itself, so construction and fitting are chained.
rf_model_on_full_data = RandomForestRegressor(random_state=42).fit(X, y)

In [None]:
# Fill missing values in the test predictors.
# The imputer learns its fill values (column means, the SimpleImputer default)
# from the training predictors X rather than from the test set, so the test data
# is treated the same way unseen data would be.
my_imputer = SimpleImputer()
my_imputer.fit(X)

# `transform` returns a plain array; wrap it back into a DataFrame so the column
# names and row index that the model was fitted with are preserved.
test_X = pd.DataFrame(
    my_imputer.transform(test_X),
    columns=test_X.columns,
    index=test_X.index,
)

# make predictions which we will submit.
test_preds = rf_model_on_full_data.predict(test_X)

# Save the predictions in the format needed to score them in the competition:
# one row per test Id with its predicted SalePrice.
output = pd.DataFrame({'Id': test_data.Id,
                       'SalePrice': test_preds})

output.to_csv('submission.csv', index=False)