In [227]:
from sklearn.linear_model import ElasticNet, LinearRegression as lr
from sklearn.ensemble import GradientBoostingRegressor as gbr, RandomForestRegressor as rfr
from xgboost import XGBRegressor

In [228]:
# Useful if you are debugging the function inside another .py script
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [229]:
import pandas as pd

# Load the train and test tables from the encoded_houses_* CSV files.
# The paths are relative, so they resolve against the notebook's working directory.
houses_train = pd.read_csv('../Data/encoded_houses_train.csv')
houses_test = pd.read_csv('../Data/encoded_houses_test.csv')

In [230]:
# Everything except the target column is a feature.
is_feature = houses_train.columns != "SalePrice"
feature_cols = houses_train.columns[is_feature]

X_train = houses_train.loc[:, is_feature].values  # convert to np.array
y_train = houses_train.loc[:, houses_train.columns == "SalePrice"].values.reshape(-1, )  # 1-D target

# Pick the test features by *name*, in the training order. The previous code
# applied the boolean mask computed on houses_train to houses_test, which only
# lines up when both frames have identical columns in identical order; any
# difference silently shifted or dropped the wrong column. Selecting by label
# raises a KeyError instead if a training feature is missing from the test set.
X_test = houses_test.loc[:, feature_cols].values  # convert to np.array

In [231]:
from stacking import stacking_regression
from sklearn.metrics import mean_squared_error
import numpy as np

In [232]:
from sklearn.metrics import mean_squared_log_error
def rmsle(y, y_pred):
    """Root mean squared logarithmic error.

    Returns the square root of ``mean_squared_log_error(y, y_pred)``, where
    ``y`` holds the true values and ``y_pred`` the predictions.
    """
    msle = mean_squared_log_error(y, y_pred)
    return np.sqrt(msle)

In [233]:
# Level-0 (base) learners passed to the stacking routine. Only the XGBoost
# model is active; the other candidates are kept here, commented out, for
# reference:
#
#   # linear model, ElasticNet = lasso + ridge
#   ElasticNet(random_state=0, fit_intercept=True, alpha=0.18069, l1_ratio=0.01)
#
#   # deep random forest model
#   rfr(random_state=0, n_estimators=1000, max_depth=20, max_features=70)
#
#   # aggressive random forest model
#   rfr(random_state=0, n_estimators=1500, max_depth=10, max_features=75)
#
#   # conservative gbm model
#   gbr(random_state=0, learning_rate=0.005, max_features='sqrt',
#       min_samples_leaf=15, min_samples_split=10, n_estimators=3000, max_depth=3)
#
#   # aggressive gbm model
#   gbr(random_state=0, learning_rate=0.01, max_features='sqrt',
#       min_samples_leaf=10, min_samples_split=5, n_estimators=1000, max_depth=9)

xgb_params = dict(
    max_depth=3,
    learning_rate=0.03,
    n_estimators=1700,       # number of boosted trees to fit
    silent=True,             # True suppresses the messages xgboost prints while running
    objective='reg:linear',
    booster='gbtree',        # which booster to use: gbtree, gblinear or dart
    n_jobs=-1,               # number of parallel threads used to run xgboost (replaces nthread)
    gamma=0,                 # minimum loss reduction required to split a leaf node further
    min_child_weight=1,      # minimum sum of instance weight (hessian) needed in a child
    max_delta_step=0,        # maximum delta step allowed for each tree's weight estimation
    subsample=1,             # subsample ratio of the training instances
    colsample_bytree=1,      # subsample ratio of columns when constructing each tree
    colsample_bylevel=0.3,   # subsample ratio of columns for each split, in each level
    reg_alpha=0,             # L1 regularization term on weights
    reg_lambda=1,            # L2 regularization term on weights
    scale_pos_weight=1,      # balancing of positive and negative weights
    base_score=0.5,          # initial prediction score of all instances (global bias)
    random_state=743,
    missing=None,
)

models = [XGBRegressor(**xgb_params)]

# Linear meta-model. Note that `meta_model` is reassigned to a gradient-boosting
# regressor further down, before it is used for any fitting.
meta_model = lr(normalize=True)

In [234]:
%%time
# Run the base models through 5-fold stacking, scoring each fold with rmsle.
# NOTE(review): presumably transform_target is applied to y before fitting and
# transform_pred maps the predictions back to the price scale; confirm in stacking.py.
stacking_features, stacking_prediction = stacking_regression(
    models,
    X_train, y_train, X_test,
    n_folds=5,
    metric=rmsle,
    transform_target=np.log1p,
    transform_pred=np.expm1,
    verbose=2,
)


metric: [rmsle]

using default dataset
model 0: [XGBRegressor]
    fold 0: [0.10726674]
    fold 1: [0.13252907]
    fold 2: [0.12698703]
    fold 3: [0.10451538]
    fold 4: [0.11525209]
    ----
    MEAN:   [0.11781538]

CPU times: user 43 s, sys: 318 ms, total: 43.3 s
Wall time: 43.5 s


In [235]:
# Shapes of the stacked train features and test predictions, one per line.
print(stacking_features.shape, stacking_prediction.shape, sep="\n")

(1460, 1)
(1459, 1)


In [236]:
# First five rows of the stacked train features and of the test predictions.
stacking_features[:5]
stacking_prediction[:5]

array([[ 204999.015625],
       [ 184164.21875 ],
       [ 208079.203125],
       [ 184706.3125  ],
       [ 295874.3125  ]])

array([[ 123093.8203125],
       [ 157722.80625  ],
       [ 187823.58125  ],
       [ 192872.565625 ],
       [ 182545.7      ]])

Replace the shallow RF with a splines model (its meta-features are loaded from the CSV files under `../R/`)

In [237]:
# Splines-model meta-features (train) and predictions (test), read from CSV as plain arrays.
# NOTE(review): the two files carry different timestamps; confirm they come from the same model fit.
splines_features = pd.read_csv('../R/2017-11-12 17-34-10 splines_features.csv').values
splines_predictions = pd.read_csv('../R/2017-11-12 15-58-28 splines_predictions.csv').values

In [238]:
# First three values of column 1, the column that is appended to the stack later.
# NOTE(review): column 0 is presumably a row index written by the R export; confirm.
splines_features[:3, 1]
splines_predictions[:3, 1]

array([ 207566.93888495,  214669.16703789,  214696.62457309])

array([ 116475.83560003,  152685.11721187,  182211.34550559])

In [239]:
# Same three rows from the stacking output, for side-by-side comparison with the splines values.
stacking_features[:3]
stacking_prediction[:3]

array([[ 204999.015625],
       [ 184164.21875 ],
       [ 208079.203125]])

array([[ 123093.8203125],
       [ 157722.80625  ],
       [ 187823.58125  ]])

In [240]:
# Shape check before appending: the splines arrays have two columns, and only
# column 1 (a 1-D slice here) is used in the append step below.
stacking_features.shape
splines_features.shape
splines_features[:,1].shape
splines_predictions.shape

(1460, 1)

(1460, 2)

(1460,)

(1459, 2)

In [241]:
from stacking import append_meta_features

# Add column 1 of each splines array, as an (n, 1) column, to the stacked train
# features and test predictions.
# NOTE(review): the inputs are overwritten with the result, so re-running this
# cell presumably appends the splines column a second time.
stacking_features, stacking_prediction = append_meta_features(
    stacking_features,
    stacking_prediction,
    splines_features[:, 1:2],
    splines_predictions[:, 1:2],
)

In [242]:
# The stacked train features should now have one more column than before the append.
stacking_features.shape

(1460, 2)

In [243]:
# Rows 1-2 after the append: first column from the stacking run, second from the splines files.
stacking_features[1:3]
stacking_prediction[1:3]

array([[ 184164.21875   ,  214669.16703789],
       [ 208079.203125  ,  214696.62457309]])

array([[ 157722.80625   ,  152685.11721187],
       [ 187823.58125   ,  182211.34550559]])

In [244]:
# Last five rows of column 0 (the column produced by the stacking run).
stacking_features[-5:,0]

array([ 174340.171875,  197251.84375 ,  262363.34375 ,  143586.875   ,
        153073.3125  ])

In [245]:
# Last five rows of column 1 (the appended splines column).
stacking_features[-5:,1]

array([ 182136.47124518,  216730.33285582,  243911.84142698,
        143447.24443165,  149231.1106964 ])

In [246]:
import sklearn.model_selection as ms

def averaging_score(x, y):
    """Score a plain average of the base-model predictions against the targets.

    Parameters
    ----------
    x : 2-D array of predictions; each row is averaged across its columns.
    y : true target values, one per row of ``x``.

    Returns
    -------
    The ``rmsle`` between ``y`` and the row-wise mean of ``x``.
    """
    # rmsle is declared as rmsle(y, y_pred), so pass the targets first and the
    # averaged predictions second. The arguments used to be swapped; the score
    # is the same either way because the squared log difference is symmetric.
    return rmsle(y, x.mean(axis=1))

# Simple-average ensemble scored through the rmsle helper.
averaging_score(stacking_features, y_train)

# Averaged predictions shown next to the true targets.
stacking_features.mean(axis=1), y_train
# The same score recomputed by hand, using a plain log on both sides.
np.sqrt(np.mean(np.square(np.log(stacking_features.mean(axis=1)) - np.log(y_train))))

# Hand-computed score of each column on its own: column 0, then column 1.
np.sqrt(np.mean(np.square(np.log(stacking_features[:, 0]) - np.log(y_train))))
np.sqrt(np.mean(np.square(np.log(stacking_features[:, 1]) - np.log(y_train))))

0.11439506632484021

(array([ 206282.97725498,  199416.69289395,  211387.91384905, ...,
         253137.59258849,  143517.05971582,  151152.2115982 ]),
 array([ 208500.,  181500.,  223500., ...,  266500.,  142125.,  147500.]))

0.11439596577593641

0.11781629738643835

0.12438846250408098

In [247]:
# Hand-computed log-RMSE of column 1 alone (the appended splines column).
np.sqrt(np.mean(np.square(np.log(stacking_features[:, 1]) - np.log(y_train))))

0.12438846250408098

One way to evaluate a meta-model: cross_val_score

In [248]:
# Level-1 (meta) model: a gradient-boosting regressor trained on the stacked features.
meta_model = gbr(
    random_state=0,
    learning_rate=0.01,
    n_estimators=1000,
    max_depth=9,
    max_features='sqrt',
    min_samples_leaf=10,
    min_samples_split=5,
)

In [249]:
# 5-fold CV of the meta-model on the stacked features. The scorer returns the
# negated MSLE, so flip the sign and take the root to get a per-fold RMSLE.
score = ms.cross_val_score(
    estimator=meta_model,
    X=stacking_features,
    y=y_train,
    cv=5,
    scoring='neg_mean_squared_log_error',
)
fold_rmsle = np.sqrt(-score)
print(fold_rmsle)
print(np.mean(fold_rmsle))

[ 0.11372052  0.13899879  0.1271798   0.11510308  0.12365745]
0.123731928536


Another way: use `GridSearchCV` to tune the meta-model's parameters, then take the cross-validation score of the best parameter set

In [250]:
# Search grid: two tree counts and depths 1-5; max_features has the single candidate 1.
param_grid = {
    "n_estimators": [500, 750],
    "max_features": range(1, 2),
    "max_depth": range(1, 6),
}
# Re-seed the existing meta_model in place. set_params returns the estimator,
# which is why the notebook echoes its repr.
meta_model.set_params(random_state=67)
grid_search_tree = ms.GridSearchCV(
    meta_model,
    param_grid,
    cv=5,
    n_jobs=-1,
    verbose=10,
    scoring='neg_mean_squared_log_error',
)
%time grid_search_tree.fit(stacking_features, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.01, loss='ls', max_depth=9,
             max_features='sqrt', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=10, min_samples_split=5,
             min_weight_fraction_leaf=0.0, n_estimators=1000,
             presort='auto', random_state=67, subsample=1.0, verbose=0,
             warm_start=False)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] max_depth=1, max_features=1, n_estimators=500 ...................
[CV] max_depth=1, max_features=1, n_estimators=500 ...................
[CV] max_depth=1, max_features=1, n_estimators=500 ...................
[CV] max_depth=1, max_features=1, n_estimators=500 ...................
[CV]  max_depth=1, max_features=1, n_estimators=500, score=-0.02279496719401029, total=   0.3s
[CV]  max_depth=1, max_features=1, n_estimators=500, score=-0.016067991047111596, total=   0.4s
[CV]  max_depth=1, max_features=1, n_estimators=500, score=-0.029404134664068236, total=   0.4s
[CV] max_depth=1, max_features=1, n_estimators=500 ...................
[CV] max_depth=1, max_features=1, n_estimators=750 ...................
[CV]  max_depth=1, max_features=1, n_estimators=500, score=-0.022120623227422895, total=   0.4s
[CV] max_depth=1, max_features=1, n_estimators=750 ...................
[CV] max_depth=1, max_features=1, n_estimators=750 ........

[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.7s


[CV]  max_depth=1, max_features=1, n_estimators=750, score=-0.015657489896969093, total=   0.4s
[CV] max_depth=2, max_features=1, n_estimators=500 ...................
[CV]  max_depth=2, max_features=1, n_estimators=500, score=-0.011042753526138021, total=   0.4s
[CV] max_depth=2, max_features=1, n_estimators=500 ...................
[CV]  max_depth=2, max_features=1, n_estimators=500, score=-0.01956911682723461, total=   0.4s
[CV] max_depth=2, max_features=1, n_estimators=500 ...................
[CV]  max_depth=1, max_features=1, n_estimators=750, score=-0.015240952291279862, total=   0.5s
[CV] max_depth=2, max_features=1, n_estimators=750 ...................


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    1.3s


[CV]  max_depth=2, max_features=1, n_estimators=500, score=-0.015896489836927208, total=   0.5s
[CV] max_depth=2, max_features=1, n_estimators=750 ...................
[CV]  max_depth=2, max_features=1, n_estimators=500, score=-0.012815238273745684, total=   0.5s
[CV] max_depth=2, max_features=1, n_estimators=750 ...................
[CV]  max_depth=2, max_features=1, n_estimators=500, score=-0.013594296969201735, total=   0.5s
[CV] max_depth=2, max_features=1, n_estimators=750 ...................
[CV]  max_depth=2, max_features=1, n_estimators=750, score=-0.011166975751634526, total=   0.6s
[CV] max_depth=2, max_features=1, n_estimators=750 ...................
[CV]  max_depth=2, max_features=1, n_estimators=750, score=-0.018954405044877852, total=   0.5s
[CV] max_depth=3, max_features=1, n_estimators=500 ...................
[CV]  max_depth=2, max_features=1, n_estimators=750, score=-0.01591138716769468, total=   0.5s
[CV] max_depth=3, max_features=1, n_estimators=500 ...................

[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    2.2s


[CV]  max_depth=2, max_features=1, n_estimators=750, score=-0.013680086633793208, total=   0.5s
[CV] max_depth=3, max_features=1, n_estimators=500 ...................
[CV]  max_depth=3, max_features=1, n_estimators=500, score=-0.011278137943938992, total=   0.5s
[CV] max_depth=3, max_features=1, n_estimators=500 ...................
[CV]  max_depth=3, max_features=1, n_estimators=500, score=-0.0189499325904257, total=   0.5s
[CV]  max_depth=3, max_features=1, n_estimators=500, score=-0.01564744435888353, total=   0.4s
[CV] max_depth=3, max_features=1, n_estimators=750 ...................
[CV] max_depth=3, max_features=1, n_estimators=750 ...................
[CV]  max_depth=3, max_features=1, n_estimators=500, score=-0.012125545137926711, total=   0.4s
[CV] max_depth=3, max_features=1, n_estimators=750 ...................
[CV]  max_depth=3, max_features=1, n_estimators=500, score=-0.013627498938300802, total=   0.4s
[CV] max_depth=3, max_features=1, n_estimators=750 ...................


[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    3.0s


[CV]  max_depth=3, max_features=1, n_estimators=750, score=-0.011401256047427458, total=   0.6s
[CV] max_depth=3, max_features=1, n_estimators=750 ...................
[CV]  max_depth=3, max_features=1, n_estimators=750, score=-0.018732414465177382, total=   0.6s
[CV] max_depth=4, max_features=1, n_estimators=500 ...................
[CV]  max_depth=3, max_features=1, n_estimators=750, score=-0.015676831824600522, total=   0.6s
[CV] max_depth=4, max_features=1, n_estimators=500 ...................
[CV]  max_depth=3, max_features=1, n_estimators=750, score=-0.012110797417931708, total=   0.5s
[CV] max_depth=4, max_features=1, n_estimators=500 ...................
[CV]  max_depth=4, max_features=1, n_estimators=500, score=-0.011337494996882597, total=   0.4s
[CV] max_depth=4, max_features=1, n_estimators=500 ...................
[CV]  max_depth=3, max_features=1, n_estimators=750, score=-0.013811787794995268, total=   0.6s
[CV] max_depth=4, max_features=1, n_estimators=500 ..................

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    4.3s


[CV] max_depth=4, max_features=1, n_estimators=750 ...................
[CV]  max_depth=4, max_features=1, n_estimators=500, score=-0.0136668584522407, total=   0.6s
[CV] max_depth=4, max_features=1, n_estimators=750 ...................
[CV]  max_depth=4, max_features=1, n_estimators=750, score=-0.0115265639129984, total=   0.7s
[CV] max_depth=4, max_features=1, n_estimators=750 ...................
[CV]  max_depth=4, max_features=1, n_estimators=750, score=-0.01861733509431819, total=   0.7s
[CV] max_depth=5, max_features=1, n_estimators=500 ...................
[CV]  max_depth=4, max_features=1, n_estimators=750, score=-0.015639963322369378, total=   0.5s
[CV] max_depth=5, max_features=1, n_estimators=500 ...................
[CV]  max_depth=4, max_features=1, n_estimators=750, score=-0.011888848467420969, total=   0.6s
[CV] max_depth=5, max_features=1, n_estimators=500 ...................
[CV]  max_depth=5, max_features=1, n_estimators=500, score=-0.011430718594326397, total=   0.7s
[CV

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.8s


[CV] max_depth=5, max_features=1, n_estimators=750 ...................
[CV]  max_depth=5, max_features=1, n_estimators=500, score=-0.012041314866891995, total=   0.6s
[CV]  max_depth=5, max_features=1, n_estimators=500, score=-0.013947590671523314, total=   0.6s
[CV] max_depth=5, max_features=1, n_estimators=750 ...................
[CV] max_depth=5, max_features=1, n_estimators=750 ...................
[CV]  max_depth=5, max_features=1, n_estimators=750, score=-0.011657109358759373, total=   0.8s
[CV] max_depth=5, max_features=1, n_estimators=750 ...................
[CV]  max_depth=5, max_features=1, n_estimators=750, score=-0.018633138603155703, total=   0.8s
[CV]  max_depth=5, max_features=1, n_estimators=750, score=-0.01571053988512303, total=   0.7s
[CV]  max_depth=5, max_features=1, n_estimators=750, score=-0.012138390242397198, total=   0.7s
[CV]  max_depth=5, max_features=1, n_estimators=750, score=-0.014200750497720405, total=   0.4s


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    7.2s finished


CPU times: user 445 ms, sys: 162 ms, total: 607 ms
Wall time: 7.67 s


GridSearchCV(cv=5, error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.01, loss='ls', max_depth=9,
             max_features='sqrt', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=10, min_samples_split=5,
             min_weight_fraction_leaf=0.0, n_estimators=1000,
             presort='auto', random_state=67, subsample=1.0, verbose=0,
             warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [500, 750], 'max_features': range(1, 2), 'max_depth': range(1, 6)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_log_error', verbose=10)

In [251]:
# Parameter combination with the best mean cross-validation score.
grid_search_tree.best_params_

{'max_depth': 4, 'max_features': 1, 'n_estimators': 500}

In [252]:
# best_score_ is a negated MSLE (scoring='neg_mean_squared_log_error'): flip the sign, then take the root.
np.sqrt(-grid_search_tree.best_score_)

0.1192973994131904

In [270]:
# Final test-set predictions from the meta-model refitted with the best grid
# parameters (the search uses refit=True, so best_estimator_ is available).
stacking_prediction_model = grid_search_tree.best_estimator_.predict(stacking_prediction)
stacking_prediction_model

array([ 122761.86713292,  155040.28695736,  177587.87086525, ...,
        162191.93001178,  120129.61098835,  214317.4667008 ])

In [255]:
# First ten rows of the test-set meta-features that were fed to the meta-model.
stacking_prediction[:10]

array([[ 123093.8203125 ,  116475.83560003],
       [ 157722.80625   ,  152685.11721187],
       [ 187823.58125   ,  182211.34550559],
       [ 192872.565625  ,  200004.35554143],
       [ 182545.7       ,  199303.35548476],
       [ 172196.946875  ,  172792.20986824],
       [ 174421.21875   ,  178415.4691187 ],
       [ 168104.346875  ,  164221.24539806],
       [ 185497.56875   ,  190698.41061584],
       [ 127708.315625  ,  119437.52758844]])

In [260]:
# Build the submission frame first, then preview it. The preview used to sit in
# a cell placed before the one that defines `submission`, so running the
# notebook top to bottom on a fresh kernel raised a NameError.
# Ids 1461..2919 give one row for each of the 1459 test predictions.
submission = pd.DataFrame({"Id": range(1461, 2920),
                           "SalePrice": stacking_prediction_model})   # values
submission.head()

In [268]:
import datetime

# Minute-resolution timestamp, used to label the submission file.
now = datetime.datetime.now()
time = '{:%Y-%m-%d %H %M}'.format(now)
time

'2017-11-12 18 00'

In [269]:
# Write the submission without the index column; the file name embeds the `time` string.
submission_path = "./Ensemble Submissions/submission {}.csv".format(time)
submission.to_csv(submission_path, sep=',', index=False)