# House Prices (Kaggle competition)

The goal is to predict the sales price for each house. 

The metric used is the classical Root-Mean-Squared-Error (RMSE) between the log of predicted value and log of SalePrice.

In [2]:
# Import relevant modules

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

In [3]:
# Import the data

train_data = pd.read_csv(r"/Data/House Prices/train.csv")
test_data = pd.read_csv(r"/Data/House Prices/test.csv")

In [4]:
categorical_data = ["SaleType", "Neighborhood", "MSZoning","Street","Alley","LotShape","LandContour","Utilities",\
                    "LotConfig","LandSlope","Condition1","Condition2","BldgType","HouseStyle","RoofStyle","RoofMatl", \
                    "Exterior1st","Exterior2nd","MasVnrType","ExterQual","ExterCond","Foundation","BsmtQual",\
                   "BsmtCond","BsmtExposure","BsmtFinType1","BsmtFinType2","Heating","HeatingQC","CentralAir","Electrical",\
                    "KitchenQual","Functional","FireplaceQu","GarageType","GarageFinish","GarageQual","GarageCond"\
                    ,"PavedDrive","PoolQC","Fence","MiscFeature","SaleCondition"]

numerical_data = ["GrLivArea", "GarageCars", "OverallQual", "FullBath", "YearBuilt", "MSSubClass", "LotFrontage", "LotArea", \
                  "OverallCond", "YearRemodAdd", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF", \
                  "1stFlrSF", "2ndFlrSF", "LowQualFinSF", "BsmtFullBath", "BsmtHalfBath", "HalfBath" , "BedroomAbvGr", \
                  "KitchenAbvGr", "TotRmsAbvGrd", "Fireplaces", "GarageYrBlt", "GarageArea", "WoodDeckSF", "OpenPorchSF",\
                  "EnclosedPorch", "3SsnPorch" , "ScreenPorch" , "PoolArea" , "MiscVal" , "MoSold", "YrSold" ]          

train_prepare = train_data[numerical_data + categorical_data]
Y_train = train_data["SalePrice"]

In [5]:
# Transformation Pipelines

train_num = numerical_data
train_cat = categorical_data

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.pipeline import Pipeline

num_pipeline = Pipeline([("imputer", SimpleImputer(strategy="median")), ("minmax", MinMaxScaler())])
cat_pipeline = Pipeline([("imp", SimpleImputer(strategy="most_frequent")), ("onehot", OneHotEncoder(sparse=False))])

from sklearn.compose import ColumnTransformer

full_pipeline = ColumnTransformer([("num", num_pipeline, numerical_data), ("cat", cat_pipeline, categorical_data)])

train_ready = full_pipeline.fit_transform(train_prepare)

X_train = pd.DataFrame(train_ready)

In [6]:
# Transform the test data

test_prepare = test_data[numerical_data + categorical_data]
X_test_array = full_pipeline.transform(test_prepare)

X_test = pd.DataFrame(X_test_array)
X_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,278,279,280,281,282,283,284,285,286,287
0,0.105878,0.25,0.444444,0.333333,0.644928,0.0,0.202055,0.048246,0.625,0.183333,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.187453,0.25,0.555556,0.333333,0.623188,0.0,0.205479,0.060609,0.625,0.133333,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.243971,0.5,0.444444,0.666667,0.905797,0.235294,0.181507,0.058566,0.5,0.8,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.239261,0.5,0.555556,0.666667,0.913043,0.235294,0.195205,0.040562,0.625,0.8,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.178222,0.5,0.777778,0.666667,0.869565,0.588235,0.075342,0.017318,0.5,0.7,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [7]:
# Try two quick models: Linear Regressor and Random Forest Regressor
# First: Linear Regressor

from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(X_train, Y_train)

from sklearn.metrics import mean_squared_error

train_predictions = lin_reg.predict(X_train)
lin_mse = mean_squared_error(Y_train, train_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

20682.731640595728

In [8]:
# Second: Random Forest Regressor

from sklearn.ensemble import RandomForestRegressor

rand_tree = RandomForestRegressor()
rand_tree.fit(X_train, Y_train)

train_rand_pred = rand_tree.predict(X_train)
rand_mse = mean_squared_error(train_rand_pred, Y_train)
rand_rmse = np.sqrt(rand_mse)
rand_rmse



12951.73535693878

In [9]:
# RandomTreeRegressor probably overfit on training data: trying it using Cross-Validation

from sklearn.model_selection import cross_val_score

rand_scores = cross_val_score(rand_tree, X_train, Y_train, scoring="neg_mean_squared_error", cv=10)
rand_rmse_scores = np.sqrt(-rand_scores)

In [10]:
# Looking at the results

def display_scores(scores):
    print("Scores: ", scores)
    print("Mean: ", scores.mean())
    print("Standard deviation: ", scores.std())
    
display_scores(rand_rmse_scores)

Scores:  [26874.83217506 31090.12247309 26142.32310405 37769.61851027
 31712.23452245 25929.85975437 28652.03860143 25993.70641793
 36606.4560845  28513.09204335]
Mean:  29928.42836864972
Standard deviation:  4115.504743402177


In [11]:
# Scores make more sense, close to Linear Regressor's score. 
# Let's compute the cross-validation scores for the Linear Regressor.

lin_scores = cross_val_score(lin_reg, X_train, Y_train, scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

Scores:  [2.41421010e+04 3.52615876e+04 2.33147436e+04 4.16330963e+04
 3.05024623e+04 4.35294023e+04 2.41984774e+04 2.26561670e+04
 6.67739624e+04 1.75785632e+15]
Mean:  175785631634779.9
Standard deviation:  527356894800335.7


In [12]:
# XGBoost model

import xgboost

xgb_reg = xgboost.XGBRegressor()
xgb_reg.fit(X_train, Y_train)
xgb_pred = xgb_reg.predict(X_train)

xgb_mse = mean_squared_error(Y_train, xgb_pred)
xgb_rmse = np.sqrt(xgb_mse)
xgb_rmse



  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


15138.40728774057

In [13]:
# XGBoost with cross-val

xgb_scores = cross_val_score(xgb_reg, X_train, Y_train, scoring="neg_mean_squared_error", cv=10)
xgb_rmse_scores = np.sqrt(-xgb_scores)
display_scores(xgb_rmse_scores)

  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \


Scores:  [21652.21522123 23893.36010646 20700.95167937 36129.55338415
 30079.60822278 25286.05537425 22488.80335979 21312.65540284
 29334.46823769 25173.55205649]
Mean:  25605.122304504715
Standard deviation:  4644.609360072036


In [14]:
# Best model for the moment with cross-val are:
# XGBoost: 30687
# RandomTree:34675
# LinearRegressor: 35449

# Before fine-tuning the models, let's try Linear Regression regularized (Elastic Net)

from sklearn.linear_model import ElasticNet

elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)
elastic_net.fit(X_train, Y_train)
elastic_pred = elastic_net.predict(X_train)

elastic_mse = mean_squared_error(Y_train, elastic_pred)
elastic_rmse = np.sqrt(elastic_mse)
elastic_rmse

31897.836070947873

In [15]:
# ElasticNet with cross-val

elastic_scores = cross_val_score(elastic_net, X_train, Y_train, scoring="neg_mean_squared_error", cv=10)
elastic_rmse_scores = np.sqrt(-elastic_scores)
display_scores(elastic_rmse_scores)

Scores:  [23286.6879674  30700.46280817 29753.20617927 40547.39195789
 42652.82262046 32244.5408906  29631.91352524 25072.20524007
 50844.07708476 26645.19924852]
Mean:  33137.850752237035
Standard deviation:  8328.789155597337


In [16]:
# Elastic Net has from far the worst score (why?)
# Let's go on with XGBoost and RandomTreeRegressor and fine-tune the hyperparameters

from sklearn.model_selection import GridSearchCV

grid = [{"n_estimators": [3, 10, 30], "max_features": [2, 4, 6, 8]},\
              {"bootstrap": [False], "n_estimators": [3, 10], "max_features": [2, 3, 4]}]

grid_search = GridSearchCV(estimator=rand_tree, param_grid=grid,\
                                 scoring="neg_mean_squared_error", return_train_score=True)

grid_search.fit(X_train, Y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=10, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid=[{'max_features': [2, 4, 6, 8],

In [17]:
grid_search.best_params_

{'max_features': 8, 'n_estimators': 30}

In [18]:
grid_search.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features=8, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=30,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [19]:
cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results["mean_test_score"], cv_results["params"]):
    print (np.sqrt(- mean_score), params)

48582.214781735 {'max_features': 2, 'n_estimators': 3}
40639.0631178682 {'max_features': 2, 'n_estimators': 10}
36896.112142586826 {'max_features': 2, 'n_estimators': 30}
45188.91023533657 {'max_features': 4, 'n_estimators': 3}
38263.81446082497 {'max_features': 4, 'n_estimators': 10}
36753.24589367654 {'max_features': 4, 'n_estimators': 30}
43578.986548577996 {'max_features': 6, 'n_estimators': 3}
38036.046544820165 {'max_features': 6, 'n_estimators': 10}
35105.712278086496 {'max_features': 6, 'n_estimators': 30}
42572.59952811078 {'max_features': 8, 'n_estimators': 3}
34925.72741489794 {'max_features': 8, 'n_estimators': 10}
33218.04387938909 {'max_features': 8, 'n_estimators': 30}
43503.19354263185 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
37824.5055079756 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
46375.341552067606 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
38429.39866409757 {'bootstrap': False, 'max_features': 3, 'n_estimators': 1

In [20]:
# Score for fine-tuned RandomTreeRegressor: 31393 (from 34675)

# Let's now fine-tune XGBoost

params = {'max_depth': [3,5,7], 'min_child_weight': [1,3,5]}

xgb = xgboost.XGBRegressor(learning_rate= 0.1, n_estimators= 1000, seed=0, subsample= 0.8, colsample_bytree= 0.8) 
            

grid_search_xgb = GridSearchCV(estimator=xgb, param_grid=params,\
                                 scoring="neg_mean_squared_error", return_train_score=True)


In [None]:
%timeit grid_search_xgb.fit(X_train, Y_train)



  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \








  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \








  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \








  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




In [None]:
cv_results_xgb = grid_search_xgb.cv_results_
for mean_score, params in zip(cv_results_xgb["mean_test_score"], cv_results_xgb["params"]):
    print (np.sqrt(- mean_score), params)

In [None]:
# Best score for XGBoost: 31227 (31393 for RandomTree)
# Let's select the necessary hyperparameters for XGBoost:

grid_search_xgb.best_estimator_

In [None]:
# Apply on test data

last_xgb = grid_search_xgb.best_estimator_

last_xgb.fit(X_train, Y_train)

last_scores = cross_val_score(last_xgb, X_train, Y_train, scoring="neg_mean_squared_error", cv=10)
last_rmse_scores = np.sqrt(-last_scores)
display_scores(last_rmse_scores)

In [None]:
# Predicting results

%timeit final_predictions = last_xgb.predict(X_test)