In [228]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
from xgboost import XGBRegressor

from sklearn.preprocessing import Imputer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

%config InlineBackend.figure_format = 'retina'
from pylab import rcParams
rcParams['figure.figsize'] = (9, 6)

In [229]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [230]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-n

In [231]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
Id               1459 non-null int64
MSSubClass       1459 non-null int64
MSZoning         1455 non-null object
LotFrontage      1232 non-null float64
LotArea          1459 non-null int64
Street           1459 non-null object
Alley            107 non-null object
LotShape         1459 non-null object
LandContour      1459 non-null object
Utilities        1457 non-null object
LotConfig        1459 non-null object
LandSlope        1459 non-null object
Neighborhood     1459 non-null object
Condition1       1459 non-null object
Condition2       1459 non-null object
BldgType         1459 non-null object
HouseStyle       1459 non-null object
OverallQual      1459 non-null int64
OverallCond      1459 non-null int64
YearBuilt        1459 non-null int64
YearRemodAdd     1459 non-null int64
RoofStyle        1459 non-null object
RoofMatl         1459 non-null object
Exterior1st      1458 non-

In [232]:
train.drop(["Street", "Alley", "FireplaceQu", "PoolQC", "Fence", "MiscFeature", 'LotFrontage', 'MasVnrArea', 'GarageYrBlt'], axis=1, inplace=True)
test.drop(["Street", "Alley", "FireplaceQu", "PoolQC", "Fence", "MiscFeature", 'LotFrontage', 'MasVnrArea', 'GarageYrBlt'], axis=1, inplace=True)

In [235]:
y_train = train.SalePrice
train.drop('SalePrice', axis=1, inplace=True)

In [238]:
train['is_test'] = 0
test['is_test'] = 1

In [239]:
df = pd.concat([train, test])

In [240]:
df1 = df

In [241]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2919 entries, 0 to 1458
Data columns (total 72 columns):
Id               2919 non-null int64
MSSubClass       2919 non-null int64
MSZoning         2915 non-null object
LotArea          2919 non-null int64
LotShape         2919 non-null object
LandContour      2919 non-null object
Utilities        2917 non-null object
LotConfig        2919 non-null object
LandSlope        2919 non-null object
Neighborhood     2919 non-null object
Condition1       2919 non-null object
Condition2       2919 non-null object
BldgType         2919 non-null object
HouseStyle       2919 non-null object
OverallQual      2919 non-null int64
OverallCond      2919 non-null int64
YearBuilt        2919 non-null int64
YearRemodAdd     2919 non-null int64
RoofStyle        2919 non-null object
RoofMatl         2919 non-null object
Exterior1st      2918 non-null object
Exterior2nd      2918 non-null object
MasVnrType       2895 non-null object
ExterQual        2919 non-

In [242]:
columns_to_transform = ['MSZoning', 'LotShape',
       'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood',
       'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
       'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual',
       'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 
       'KitchenQual', 'Functional', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition']

In [243]:
df_dummies = pd.get_dummies(df1, columns = columns_to_transform)

In [244]:
df_dummies.isnull().sum()

Id                       0
MSSubClass               0
LotArea                  0
OverallQual              0
OverallCond              0
YearBuilt                0
YearRemodAdd             0
BsmtFinSF1               1
BsmtFinSF2               1
BsmtUnfSF                1
TotalBsmtSF              1
1stFlrSF                 0
2ndFlrSF                 0
LowQualFinSF             0
GrLivArea                0
BsmtFullBath             2
BsmtHalfBath             2
FullBath                 0
HalfBath                 0
BedroomAbvGr             0
KitchenAbvGr             0
TotRmsAbvGrd             0
Fireplaces               0
GarageCars               1
GarageArea               1
WoodDeckSF               0
OpenPorchSF              0
EnclosedPorch            0
3SsnPorch                0
ScreenPorch              0
                        ..
GarageFinish_RFn         0
GarageFinish_Unf         0
GarageQual_Ex            0
GarageQual_Fa            0
GarageQual_Gd            0
GarageQual_Po            0
G

In [245]:
df_dummies.head(10)

Unnamed: 0,Id,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,8450,7,5,2003,2003,706.0,0.0,150.0,...,0,0,0,1,0,0,0,0,1,0
1,2,20,9600,6,8,1976,1976,978.0,0.0,284.0,...,0,0,0,1,0,0,0,0,1,0
2,3,60,11250,7,5,2001,2002,486.0,0.0,434.0,...,0,0,0,1,0,0,0,0,1,0
3,4,70,9550,7,5,1915,1970,216.0,0.0,540.0,...,0,0,0,1,1,0,0,0,0,0
4,5,60,14260,8,5,2000,2000,655.0,0.0,490.0,...,0,0,0,1,0,0,0,0,1,0
5,6,50,14115,5,5,1993,1995,732.0,0.0,64.0,...,0,0,0,1,0,0,0,0,1,0
6,7,20,10084,8,5,2004,2005,1369.0,0.0,317.0,...,0,0,0,1,0,0,0,0,1,0
7,8,60,10382,7,6,1973,1973,859.0,32.0,216.0,...,0,0,0,1,0,0,0,0,1,0
8,9,50,6120,7,5,1931,1950,0.0,0.0,952.0,...,0,0,0,1,1,0,0,0,0,0
9,10,190,7420,5,6,1939,1950,851.0,0.0,140.0,...,0,0,0,1,0,0,0,0,1,0


In [246]:
X_train = df_dummies[df_dummies.is_test==0].drop('is_test', axis=1)

In [247]:
X_test = df_dummies[df_dummies.is_test==1].drop('is_test', axis=1)

In [248]:
columns = X_train.columns

In [252]:
X_train.head(20)

Unnamed: 0,Id,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,8450,7,5,2003,2003,706.0,0.0,150.0,...,0,0,0,1,0,0,0,0,1,0
1,2,20,9600,6,8,1976,1976,978.0,0.0,284.0,...,0,0,0,1,0,0,0,0,1,0
2,3,60,11250,7,5,2001,2002,486.0,0.0,434.0,...,0,0,0,1,0,0,0,0,1,0
3,4,70,9550,7,5,1915,1970,216.0,0.0,540.0,...,0,0,0,1,1,0,0,0,0,0
4,5,60,14260,8,5,2000,2000,655.0,0.0,490.0,...,0,0,0,1,0,0,0,0,1,0
5,6,50,14115,5,5,1993,1995,732.0,0.0,64.0,...,0,0,0,1,0,0,0,0,1,0
6,7,20,10084,8,5,2004,2005,1369.0,0.0,317.0,...,0,0,0,1,0,0,0,0,1,0
7,8,60,10382,7,6,1973,1973,859.0,32.0,216.0,...,0,0,0,1,0,0,0,0,1,0
8,9,50,6120,7,5,1931,1950,0.0,0.0,952.0,...,0,0,0,1,1,0,0,0,0,0
9,10,190,7420,5,6,1939,1950,851.0,0.0,140.0,...,0,0,0,1,0,0,0,0,1,0


In [217]:
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0, verbose=0, copy=True)

In [253]:
imputer.fit(X_train)

Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)

In [254]:
X_train_imputed = imputer.transform(X_train)

In [258]:
X_train_imputed = pd.DataFrame(X_train_imputed, columns=columns)

In [260]:
X_train_imputed.head(20)

Unnamed: 0,Id,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1.0,60.0,8450.0,7.0,5.0,2003.0,2003.0,706.0,0.0,150.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2.0,20.0,9600.0,6.0,8.0,1976.0,1976.0,978.0,0.0,284.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,3.0,60.0,11250.0,7.0,5.0,2001.0,2002.0,486.0,0.0,434.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,4.0,70.0,9550.0,7.0,5.0,1915.0,1970.0,216.0,0.0,540.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
4,5.0,60.0,14260.0,8.0,5.0,2000.0,2000.0,655.0,0.0,490.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
5,6.0,50.0,14115.0,5.0,5.0,1993.0,1995.0,732.0,0.0,64.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
6,7.0,20.0,10084.0,8.0,5.0,2004.0,2005.0,1369.0,0.0,317.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
7,8.0,60.0,10382.0,7.0,6.0,1973.0,1973.0,859.0,32.0,216.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
8,9.0,50.0,6120.0,7.0,5.0,1931.0,1950.0,0.0,0.0,952.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
9,10.0,190.0,7420.0,5.0,6.0,1939.0,1950.0,851.0,0.0,140.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [265]:
from sklearn.preprocessing import StandardScaler

In [266]:
scaler = StandardScaler()

In [267]:
scaler.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [271]:
X_train_imputed_scaled = scaler.transform(X_train)
X_train_imputed_scaled = pd.DataFrame(X_train_imputed_scaled, columns=columns)

In [273]:
X_test_imputed_scaled = scaler.transform(imputer.transform(X_test))

In [274]:
X_train_fin, X_val, y_train_fin, y_val = train_test_split(X_train_imputed_scaled, y_train, test_size=0.2)

In [275]:
X_train_fin.shape

(1168, 266)

In [276]:
X_val.shape

(292, 266)

In [277]:
X_test_scaled.shape

(1108, 260)

In [278]:
y_train_fin.shape

(1168,)

In [279]:
y_val.shape

(292,)

In [282]:
pipeline_grid_search_cv = Pipeline([('imputer', Imputer()),
                                    ('xgbrg', XGBRegressor())])

In [283]:
params = {"xgbrg__n_estimators": [25, 50, 100, 150, 200, 300],
         "xgbrg__learning_rate": [0.001, 0.01, 0.1, 1.0]} 

In [284]:
from sklearn.model_selection import GridSearchCV

In [285]:
searchCV = GridSearchCV(pipeline_grid_search_cv, params, n_jobs=1, cv=5, refit=True)

In [286]:
searchCV.fit(X_train_fin, y_train_fin)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('xgbrg', XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_dep...       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'xgbrg__n_estimators': [25, 50, 100, 150, 200, 300], 'xgbrg__learning_rate': [0.001, 0.01, 0.1, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [287]:
sorted(searchCV.grid_scores_, key = lambda x: -x.mean_validation_score)



[mean: 0.87492, std: 0.03770, params: {'xgbrg__learning_rate': 0.1, 'xgbrg__n_estimators': 300},
 mean: 0.87306, std: 0.03870, params: {'xgbrg__learning_rate': 0.1, 'xgbrg__n_estimators': 200},
 mean: 0.86993, std: 0.04008, params: {'xgbrg__learning_rate': 0.1, 'xgbrg__n_estimators': 150},
 mean: 0.86597, std: 0.04099, params: {'xgbrg__learning_rate': 0.1, 'xgbrg__n_estimators': 100},
 mean: 0.85172, std: 0.04313, params: {'xgbrg__learning_rate': 0.1, 'xgbrg__n_estimators': 50},
 mean: 0.81448, std: 0.04822, params: {'xgbrg__learning_rate': 0.01, 'xgbrg__n_estimators': 300},
 mean: 0.78577, std: 0.04617, params: {'xgbrg__learning_rate': 0.1, 'xgbrg__n_estimators': 25},
 mean: 0.73292, std: 0.06085, params: {'xgbrg__learning_rate': 1.0, 'xgbrg__n_estimators': 50},
 mean: 0.73136, std: 0.06115, params: {'xgbrg__learning_rate': 1.0, 'xgbrg__n_estimators': 200},
 mean: 0.73136, std: 0.06151, params: {'xgbrg__learning_rate': 1.0, 'xgbrg__n_estimators': 100},
 mean: 0.73132, std: 0.06113, pa

In [288]:
final_pipeline = searchCV.best_estimator_

In [289]:
mae_grid_search_cv = cross_val_score(final_pipeline, X_train_fin, y_train_fin, scoring='neg_mean_absolute_error')

In [290]:
print('CV Mean Absolute Error with GridSearchCV: %2f' %(-1 * mae_grid_search_cv.mean()))

CV Mean Absolute Error with GridSearchCV: 17328.820579


In [292]:
predicted_prices = final_pipeline.predict(X_test_imputed_scaled)

In [293]:
print(predicted_prices)

[124463.55 163719.83 177176.86 ... 143030.97 115504.   231934.27]


In [322]:
ID = list(test.Id)

In [323]:
my_submission = pd.DataFrame({'Id': ID, 'SalePrice': predicted_prices})

In [324]:
my_submission.to_csv('submission.csv', index=False)