In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBRegressor

import matplotlib.pyplot as plt

In [2]:
%matplotlib inline

In [3]:
#set the location of the directory with data
directory = 'C:\\Users\\shenbaga.kumar\\Documents\\Kaggle\\HousePrices\\'

In [4]:
train = pd.read_csv(directory + 'train.csv')
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
test = pd.read_csv(directory + 'test.csv')

In [6]:
# drop a few columns that have very low data fill rate
#poor fill rate in train: Alley, FireplaceQu, PoolQC, Fence, MiscFeature
#poor fill rate in test: Alley, FireplaceQu, PoolQC, Fence, MiscFeature

In [7]:
train.drop(['Alley','FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'],axis=1,inplace=True)
test.drop(['Alley','FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'],axis=1,inplace=True)

In [8]:
#concatenate the two dfs and encode object colums

In [9]:

ntrain = len(train)
ntest = len(test)

ntrain, ntest

(1460, 1459)

In [10]:
concat_df = pd.concat([train, test],axis = 0)
concat_df.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,BsmtFinType1,...,SaleType,ScreenPorch,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold
0,856,854,0,3,1Fam,TA,No,706.0,0.0,GLQ,...,WD,0,Pave,8,856.0,AllPub,0,2003,2003,2008
1,1262,0,0,3,1Fam,TA,Gd,978.0,0.0,ALQ,...,WD,0,Pave,6,1262.0,AllPub,298,1976,1976,2007
2,920,866,0,3,1Fam,TA,Mn,486.0,0.0,GLQ,...,WD,0,Pave,6,920.0,AllPub,0,2001,2002,2008
3,961,756,0,3,1Fam,Gd,No,216.0,0.0,ALQ,...,WD,0,Pave,7,756.0,AllPub,0,1915,1970,2006
4,1145,1053,0,4,1Fam,TA,Av,655.0,0.0,GLQ,...,WD,0,Pave,9,1145.0,AllPub,192,2000,2000,2008


In [11]:
#encode object cols
for feature in concat_df.columns:
    if concat_df[feature].dtype == 'object':
        concat_df[feature] = pd.Categorical(concat_df[feature]).codes

In [12]:
#split the concat back to train and test
final_train = concat_df[:ntrain]
final_test = concat_df[ntrain:]

In [13]:
features = list(concat_df.columns.values)

In [14]:
features.remove('Id')

In [15]:
y_train = final_train['SalePrice']

In [16]:
#step 1. use Sklearn XGB and Grid Search for param tuning

In [17]:
sklearn_xgb_fixed_params = {
                            'seed':0,
                            'objective': 'reg:linear',
                            'n_estimators': 1000
                            }

sklearn_xgb_params_to_be_tuned = {
                                'learning_rate':[0.01,0.1],
                                'subsample': [0.6,0.8,1.0],
                                'colsample_bytree': [0.6,0.8,1.0],
                                'max_depth': [3,5,7],
                                'min_child_weight': [1,3,5]
                                }

In [18]:
sklearn_xgb_model = XGBRegressor(**sklearn_xgb_fixed_params)

In [19]:
from datetime import datetime

In [20]:
from sklearn.grid_search import GridSearchCV

In [21]:
sklearn_xgb_model_Grid_Searched = GridSearchCV(sklearn_xgb_model,
                                               sklearn_xgb_params_to_be_tuned,
                                               cv = 5)

In [22]:
print datetime.now()

sklearn_xgb_model_Grid_Searched.fit(final_train[features], y_train) #note that the input is not a DMatrix

print datetime.now()

2016-12-18 21:57:46.457000
2016-12-18 22:20:35.898000


In [23]:
sklearn_xgb_model_Grid_Searched.best_params_

{'colsample_bytree': 1.0,
 'learning_rate': 0.01,
 'max_depth': 7,
 'min_child_weight': 1,
 'subsample': 1.0}

In [24]:
sklearn_xgb_model_Grid_Searched.best_score_

0.99962354931641018

In [26]:
from sklearn.metrics import mean_squared_error

In [27]:
#train rmse:
np.sqrt(mean_squared_error(y_true=y_train, 
                           y_pred=sklearn_xgb_model_Grid_Searched.predict(final_train[features])))

79.695173801731769

In [28]:
#test preds for submission :

#for the sklearn model
submission = pd.read_csv(directory + 'sample_submission.csv')
submission.iloc[:, 1] = sklearn_xgb_model_Grid_Searched.predict(final_test[features])
submission.to_csv('Sklearn_XGB_Tuned_19Dec_Test.csv', index=None)


In [30]:
#this  submission scored 1.58948 which is magnitudes worse than the best submission 0.12 !! not sure why 
#my guess is that it is overfitting the train data

In [31]:
#use reg alpha to control overfitting
sklearn_xgb_fixed_params = {
                            'seed':0,
                            'objective': 'reg:linear',
                            'n_estimators': 1000
                            }

sklearn_xgb_params_to_be_tuned_with_reg = {
                                'learning_rate':[0.01,0.1],
                                'subsample': [0.6,0.8,1.0],
                                'colsample_bytree': [0.6,0.8,1.0],
                                'max_depth': [3,5,7],
                                'min_child_weight': [1,3,5],
                                'reg_alpha':[1e-3, 1e-2, 0.01, 0.1, 1]
                                }

In [32]:
sklearn_xgb_model2 = XGBRegressor(**sklearn_xgb_fixed_params)
sklearn_xgb_model_Grid_Searched2 = GridSearchCV(sklearn_xgb_model2,
                                               sklearn_xgb_params_to_be_tuned_with_reg,
                                               cv = 5)

In [33]:
print datetime.now()

sklearn_xgb_model_Grid_Searched2.fit(final_train[features], y_train) #note that the input is not a DMatrix

print datetime.now()

2016-12-18 23:29:45.519000
2016-12-19 01:28:46.135000


In [34]:
#train rmse:
np.sqrt(mean_squared_error(y_true=y_train, 
                           y_pred=sklearn_xgb_model_Grid_Searched2.predict(final_train[features])))

79.694041190517936

In [35]:
sklearn_xgb_model_Grid_Searched2.best_params_

{'colsample_bytree': 1.0,
 'learning_rate': 0.01,
 'max_depth': 7,
 'min_child_weight': 1,
 'reg_alpha': 0.1,
 'subsample': 1.0}

In [38]:
#test preds 2 for submission :

#for the sklearn model
submission = pd.read_csv(directory + 'sample_submission.csv')
submission.iloc[:, 1] = sklearn_xgb_model_Grid_Searched2.predict(final_test[features])
submission.to_csv('Sklearn_XGB_Tuned_3_19Dec_Test.csv', index=None)

In [39]:
#this scored.. 1.58958.. very bad