In [1]:
import pandas as pd
import numpy as np
from scipy.stats import skew
import xgboost as xgb
from sklearn.cross_validation import KFold
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, Lasso
from math import sqrt



In [27]:
TARGET = 'SalePrice'
NFOLDS = 5
SEED = 0
NROWS = None
SUBMISSION_FILE = '/Users/Leslie/GitHub/HousePrice/input/sample_submission.csv'

## Load Data

In [44]:
train = pd.read_csv("/Users/Leslie/GitHub/HousePrice/train.csv")
test = pd.read_csv("/Users/Leslie/GitHub/HousePrice/test.csv")
ntrain = train.shape[0]
ntest = test.shape[0]

## Preprocessing

In [45]:
# y_train = np.log(train[TARGET]+1)
y_train = train[TARGET]
train.drop([TARGET], axis=1, inplace=True)
all_data = pd.concat((train.loc[:,'MSSubClass':'SaleCondition'],
                      test.loc[:,'MSSubClass':'SaleCondition']))

## Logarithm transformation on right skewed features

In [46]:
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index

skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index

all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
all_data = pd.get_dummies(all_data)

In [47]:
all_data = all_data.fillna(all_data.mean())

x_train = np.array(all_data[:train.shape[0]])
x_test = np.array(all_data[train.shape[0]:])

kf = KFold(ntrain, n_folds=NFOLDS, shuffle=True, random_state=SEED)

In [48]:
class XgbWrapper(object):
    def __init__(self, seed=0, params=None):
        self.param = params
        self.param['seed'] = seed
        self.nrounds = params.pop('nrounds', 250)

    def train(self, x_train, y_train):
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        return self.gbdt.predict(xgb.DMatrix(x))

In [49]:
def get_oof(clf):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)


In [50]:
et_params = {
    'n_jobs': 16,
    'n_estimators': 100,
    'max_features': 0.5,
    'max_depth': 12,
    'min_samples_leaf': 2,
}

rf_params = {
    'n_jobs': 16,
    'n_estimators': 100,
    'max_features': 0.2,
    'max_depth': 12,
    'min_samples_leaf': 2,
}

xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.7,
    'silent': 1,
    'subsample': 0.7,
    'learning_rate': 0.075,
    'objective': 'reg:linear',
    'max_depth': 4,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'eval_metric': 'rmse',
    'nrounds': 500
}



rd_params={
    'alpha': 10
}


ls_params={
    'alpha': 0.005
}


xg = XgbWrapper(seed=SEED, params=xgb_params)

In [51]:
xg_oof_train, xg_oof_test = get_oof(xg)
print("XG-CV: {}".format(sqrt(mean_squared_error(y_train, xg_oof_train))))


XG-CV: 25831.301777220426


In [52]:
res = xgb.cv(xgb_params, dtrain, num_boost_round=1000, nfold=4, seed=SEED, stratified=False,
             early_stopping_rounds=25, verbose_eval=10, show_stdv=True)

best_nrounds = res.shape[0] - 1
cv_mean = res.iloc[-1, 0]
cv_std = res.iloc[-1, 1]

[0]	train-rmse:10.6684+0.00673027	test-rmse:10.6684+0.0227829
[10]	train-rmse:4.90936+0.00261722	test-rmse:4.90955+0.0223482
[20]	train-rmse:2.26833+0.00167086	test-rmse:2.26815+0.0166228
[30]	train-rmse:1.05824+0.00170887	test-rmse:1.05855+0.0133175
[40]	train-rmse:0.50419+0.000822533	test-rmse:0.509794+0.0126884
[50]	train-rmse:0.255717+0.00122875	test-rmse:0.269112+0.0118271
[60]	train-rmse:0.149789+0.00182402	test-rmse:0.174855+0.0113745
[70]	train-rmse:0.108866+0.00202226	test-rmse:0.143844+0.0111975
[80]	train-rmse:0.0931667+0.00215988	test-rmse:0.134059+0.0109572
[90]	train-rmse:0.085783+0.00224862	test-rmse:0.130559+0.0111554
[100]	train-rmse:0.0806975+0.00210502	test-rmse:0.128605+0.0110318
[110]	train-rmse:0.0765357+0.00218548	test-rmse:0.127402+0.0109773
[120]	train-rmse:0.0731757+0.00191427	test-rmse:0.126657+0.0109343
[130]	train-rmse:0.0701193+0.0018936	test-rmse:0.125894+0.0111663
[140]	train-rmse:0.0672485+0.00180144	test-rmse:0.125288+0.0111493
[150]	train-rmse:0.06477

In [53]:
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test)

gbdt = xgb.train(xgb_params, dtrain, best_nrounds)

submission = pd.read_csv(SUBMISSION_FILE)
submission.iloc[:, 1] = gbdt.predict(dtest)
# saleprice = np.exp(submission['SalePrice'])-1
# submission['SalePrice'] = saleprice
submission.to_csv('/Users/Leslie/GitHub/HousePrice/input/kaggle_kernel_sub1.csv', index=None)