In [67]:
from sklearn.linear_model import ElasticNet, LinearRegression as lr
from sklearn.ensemble import GradientBoostingRegressor as gbr, RandomForestRegressor as rfr
from xgboost import XGBRegressor

In [68]:
# Useful if you are debugging the function inside another .py script
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [69]:
import pandas as pd

# Load the train and test tables from the sibling Data directory.
# NOTE(review): the file names suggest the features were already encoded
# upstream — confirm which script produces these CSVs.
houses_train = pd.read_csv('../Data/encoded_houses_train.csv')
houses_test = pd.read_csv('../Data/encoded_houses_test.csv')

In [70]:
# Features are every column except the target; .values converts to np.ndarray.
X_train = houses_train.loc[:, houses_train.columns != "SalePrice"].values
# Select the target column and flatten it into a 1-D vector.
y_train = houses_train.loc[:, houses_train.columns == "SalePrice"].values.reshape(-1, )

# Build the mask from the test frame's own columns. A boolean column mask must
# have exactly one entry per column of the frame it indexes, so reusing the
# training mask only works when both files share an identical column layout
# (and fails outright when the test file has no "SalePrice" column).
X_test = houses_test.loc[:, houses_test.columns != "SalePrice"].values

# Fail here, with a readable message, rather than deep inside model fitting.
assert X_train.shape[1] == X_test.shape[1], (
    "train/test feature counts differ: %d vs %d" % (X_train.shape[1], X_test.shape[1])
)

In [71]:
from stacking import stacking_regression
from sklearn.metrics import mean_squared_error
import numpy as np

In [72]:
from sklearn.metrics import mean_squared_log_error
def rmsle(y, y_pred):
    """Root mean squared logarithmic error between targets and predictions.

    Parameters
    ----------
    y : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values, aligned with ``y``.

    Returns
    -------
    float
        Square root of ``mean_squared_log_error(y, y_pred)``.
    """
    squared_log_error = mean_squared_log_error(y, y_pred)
    return np.sqrt(squared_log_error)

In [None]:
# Base learners handed to stacking_regression as its first argument.
models = [
    # linear model, ElasticNet = lasso + ridge
    ElasticNet(random_state=0,
               fit_intercept=True, alpha=0.18069, l1_ratio=0.01),

    # deep random forest model
    rfr(random_state=0,
        n_estimators=1000, max_depth=20, max_features=70),

    # aggressive random forest model (shallower trees, more of them)
    rfr(random_state=0,
        n_estimators=1500, max_depth=10, max_features=75),

    # conservative gbm model
    gbr(random_state=0, learning_rate=0.005, max_features='sqrt',
        min_samples_leaf=15, min_samples_split=10,
        n_estimators=3000, max_depth=3),

    # aggressive gbm model
    gbr(random_state=0, learning_rate=0.01, max_features='sqrt',
        min_samples_leaf=10, min_samples_split=5,
        n_estimators=1000, max_depth=9),
]

# Plain least-squares meta-learner.
# The former `normalize=True` keyword is gone on purpose: it was deprecated in
# scikit-learn 1.0 and removed in 1.2, where passing it raises TypeError. For
# ordinary least squares, rescaling the inputs does not change the fitted
# predictions (up to numerical precision), so the default is equivalent here.
# NOTE(review): the stacking_regression call in the next cell passes its own
# inline RandomForestRegressor and `meta_model` is reassigned further down, so
# this object appears unused in the visible part of the notebook.
meta_model = lr()

In [None]:
%%time
stacking_features, stacking_prediction = stacking_regression(models, 
                                          rfr(random_state=0,
                                              n_estimators=1000, 
                                              max_depth=5,  
                                              max_features=2), 
                                          X_train, y_train, X_test,
                                          transform_target=np.log1p, transform_pred = np.expm1, 
                                          metric=rmsle, verbose=2, n_folds=5)

metric: [rmsle]

model 0: [ElasticNet]
    fold 0: [0.11174785]
    fold 1: [0.14402997]
    fold 2: [0.12856209]
    fold 3: [0.11664593]
    fold 4: [0.15702438]
    ----
    MEAN:   [0.13268259]

model 1: [RandomForestRegressor]
    fold 0: [0.11933864]
    fold 1: [0.14926581]
    fold 2: [0.14516046]
    fold 3: [0.12589981]
    fold 4: [0.13240177]
    ----
    MEAN:   [0.13488851]

model 2: [RandomForestRegressor]
    fold 0: [0.12064394]
    fold 1: [0.15028258]
    fold 2: [0.14689157]
    fold 3: [0.12692987]


In [None]:
# Show the shapes of both stacking outputs, one per line.
print(stacking_features.shape, stacking_prediction.shape, sep="\n")

In [None]:
# Peek at the first five entries of the stacking output for the test data.
stacking_prediction[:5]

Replace the shallow random forest's stacking feature with the one from the splines model

In [None]:
# Read the splines output written to the R directory and keep it as a NumPy array.
splines_frame = pd.read_csv('../R/2017-11-12 14-38-24 splines_features.csv')
splines_features = np.array(splines_frame)

In [None]:
# First three values of the splines column (index 1) that is copied into the stack.
splines_features[:3, 1]

In [None]:
# First three rows of the stacking features, before column 2 is replaced.
stacking_features[:3]

In [None]:
# Overwrite stacking column 2 in place with column 1 of the splines array.
# NOTE(review): column 2 presumably corresponds to models[2] (the max_depth=10
# random forest) — confirm against stacking_regression's column ordering.
# NOTE(review): only stacking_features is modified here; stacking_prediction is
# left as returned by stacking_regression.
stacking_features[:,2] = splines_features[:,1]

In [None]:
# Same three rows again, to check that column 2 now holds the splines values.
stacking_features[:3]

In [None]:
import sklearn.model_selection as ms

def averaging_score(x, y):
    """Score a plain average of the stacked columns against the target.

    Parameters
    ----------
    x : 2-D array
        One column per model; each row is averaged across columns (axis=1).
    y : array-like
        True target values, one per row of ``x``.

    Returns
    -------
    float
        RMSLE between ``y`` and the row-wise mean of ``x``.
    """
    blended = x.mean(axis=1)
    # rmsle is declared as rmsle(y, y_pred): pass the truth first and the
    # prediction second. The value is unchanged (the metric is symmetric), but
    # the call now matches the helper's documented argument order.
    return rmsle(y, blended)


averaging_score(stacking_features, y_train)

One way to evaluate a meta-model: cross_val_score

In [None]:
# Meta-model candidate: a forest of 500 depth-1 trees, each split drawing from
# 2 of the stacking features.
meta_model = rfr(
    n_estimators=500,
    max_depth=1,
    max_features=2,
    random_state=0,
)

In [None]:
# 5-fold cross-validation of the meta-model on the stacking features.
# The scorer returns the negated MSLE for each fold.
score = ms.cross_val_score(
    estimator=meta_model,
    X=stacking_features,
    y=y_train,
    cv=5,
    scoring='neg_mean_squared_log_error',
)
# Flip the sign back and take the root once to get the per-fold RMSLE.
fold_rmsle = np.sqrt(-score)
print(fold_rmsle)
print(np.mean(fold_rmsle))

Another way: use GridSearchCV to tune the parameters, then take the cross-validation score of the best parameter set

In [None]:
param_grid = { "n_estimators"      : [500, 750],
           "max_features"      : range(1, 6),
           "max_depth"         : range(1, 6)}
meta_model.set_params(random_state=67)
grid_search_tree = ms.GridSearchCV(meta_model, param_grid, cv=5, n_jobs=-1, verbose=10, scoring='neg_mean_squared_log_error')
%time grid_search_tree.fit(stacking_features, y_train)

In [None]:
# Parameter combination that achieved the best mean cross-validated score.
grid_search_tree.best_params_

In [None]:
# Convert the best (negated) mean MSLE into an RMSLE-style number.
# This is the root of the mean fold MSLE, which is not the same quantity as the
# mean of per-fold RMSLEs printed by the cross_val_score cell.
np.sqrt(-grid_search_tree.best_score_)