# In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the path of every file found under the data directory
for folder, _, names in os.walk('./data_sets'):
    for name in names:
        print(os.path.join(folder, name))

# Lift pandas' row/column display limits and widen numpy's printed lines
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)
np.set_printoptions(linewidth=140)

from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

# Load the training and test tables from disk
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data_sets/kaggle_house_train.csv',
        './data_sets/kaggle_house_test.csv',
    )
)

# Keep the Id columns aside; they are removed from the feature frames below
# and needed again when the predictions are written out
train_id, test_id = train_df['Id'], test_df['Id']

# Regression target
y = train_df['SalePrice']

# Rank the columns that contain nulls by their null count. The bare
# expression is only useful for interactive inspection; it has no effect
# when the file runs as a plain script.
null_counts = train_df.isnull().sum()
null_counts[null_counts > 0].sort_values(ascending=False)

# Remove Id and the mostly-empty columns from both frames, in place
sparse_cols = ['Id', 'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'MasVnrType', 'FireplaceQu']
for frame in (train_df, test_df):
    frame.drop(columns=sparse_cols, inplace=True)

# Continuous and categorical variables, split by dtype: object columns are
# treated as categorical, int/float columns as continuous. SalePrice is the
# target, so it is excluded from the continuous feature list.
cat_var = train_df.select_dtypes(include=object).columns
cont_var = train_df.select_dtypes(include=[int, float]).columns.drop('SalePrice')

# Fill missing values.
#
# Continuous columns: impute BOTH frames with the training-set column means.
# Using the test set's own means would preprocess the test data with
# different statistics than the ones the model is fitted on, and would make
# each test prediction depend on which other rows are in the test file.
cont_means = train_df[cont_var].mean()
train_df[cont_var] = train_df[cont_var].fillna(cont_means)
test_df[cont_var] = test_df[cont_var].fillna(cont_means)

# Categorical columns: carry the previous row's value forward.
# NOTE(review): ffill cannot fill a gap in the very first row; such a cell
# stays NaN and get_dummies (default dummy_na=False) encodes it as all zeros
# for that feature. Consider mode imputation if that matters.
train_df[cat_var] = train_df[cat_var].ffill()
test_df[cat_var] = test_df[cat_var].ffill()

def _encode_features(frame):
    """One-hot encode the categorical columns of *frame*, then append the
    continuous columns unchanged."""
    encoded = pd.get_dummies(frame[cat_var])
    encoded[cont_var] = frame[cont_var]
    return encoded


# Build the design matrices for the training and test data
X = _encode_features(train_df)
test_X = _encode_features(test_df)

# Align the test design matrix with the training one.
#
# A category level that occurs in only one of the two files produces a dummy
# column in only one frame. Reindexing on the training columns:
#   * adds levels absent from the test set as all-zero columns (no test row
#     has that level), instead of copying unrelated training rows' values;
#   * drops levels that appear only in the test set, which the model was
#     never fitted on (the previous symmetric-difference lookup raised a
#     KeyError on X for exactly those columns).
# Afterwards both frames expose the same feature set.
test_X = test_X.reindex(columns=X.columns, fill_value=0)
print(test_X.head())

# Put the columns of both design matrices in alphabetical order so that
# equally named features sit at the same position
ordered_train_cols = sorted(X.columns)
ordered_test_cols = sorted(test_X.columns)
X = X.reindex(columns=ordered_train_cols)
test_X = test_X.reindex(columns=ordered_test_cols)
X.head()

# Hold out part of the training data for validation
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

# Hyper-parameter grid: every combination of n_estimators, learning_rate
# and max_depth is evaluated by the grid search
params_to_test = dict(
    n_estimators=[10, 25, 50, 100],
    learning_rate=[0.0001, 0.001, 0.01, 0.1, 1.0],
    max_depth=[3, 5, 7],
)

# 5-fold cross-validated grid search over a GradientBoostingRegressor,
# scored by (negated) root mean squared error
gbr_model = GradientBoostingRegressor(random_state=42)
grid_search = GridSearchCV(
    gbr_model,
    param_grid=params_to_test,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=4,
)
grid_search.fit(train_X, train_y)

# The best estimator found by the search becomes our model
model = grid_search.best_estimator_

print(grid_search.best_params_)

val_pred = model.predict(val_X)
# Root mean squared error between log prices, lower is better.
# Computed as sqrt(MSE) instead of passing `squared=False`, which was
# deprecated in scikit-learn 1.4 and removed in a later release; arguments
# are given in the documented (y_true, y_pred) order.
# NOTE(review): np.log yields NaN/-inf for non-positive predictions.
val_rmsle = np.sqrt(mean_squared_error(np.log(val_y), np.log(val_pred)))
# Print explicitly: a bare expression is discarded unless it ends a cell
print(f'Validation RMSE (log prices): {val_rmsle:.5f}')

# Side-by-side view of the actual and predicted sale price for the
# training rows
sale_predict = model.predict(X)
output = pd.DataFrame({'Id': train_id}).assign(
    SalePrice=train_df['SalePrice'],
    SalePredict=sale_predict,
)
output.head(20)

# Predict sale prices for the test set and pair them with the saved Ids
test_predict = model.predict(test_X)
output = pd.DataFrame({'Id': test_id}).assign(SalePrice=test_predict)
output.head(20)

# Write the submission file without the index column
output.to_csv('submission.csv', index=False)