https://machinelearningmastery.com/super-learner-ensemble-in-python/

We will test the Super Learner ensemble described in the article above by using it to fit a regression model.

### Functions and imports

In [1]:
# imports
from math import sqrt
from numpy import hstack
from numpy import vstack
from numpy import asarray
from sklearn.datasets import make_regression
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR # support vector regression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor

In [2]:
# create a list of base models
def get_models(random_state=None):
    """Build the pool of unfitted base regressors used by the super learner.

    Parameters
    ----------
    random_state : int, RandomState instance or None, default=None
        Seed forwarded to the tree- and ensemble-based regressors below so
        that their fits can be reproduced. The default ``None`` leaves them
        unseeded, which matches calling ``get_models()`` with no arguments
        before this parameter existed.

    Returns
    -------
    list
        Newly constructed estimators, always in the same order.
    """
    # NOTE: LinearRegression is not part of the base pool; it had been
    # commented out of this list.
    return [
        ElasticNet(),
        SVR(gamma='scale'),
        DecisionTreeRegressor(random_state=random_state),
        KNeighborsRegressor(),
        AdaBoostRegressor(random_state=random_state),
        BaggingRegressor(n_estimators=10, random_state=random_state),
        RandomForestRegressor(n_estimators=10, random_state=random_state),
        ExtraTreesRegressor(n_estimators=10, random_state=random_state),
    ]

In [3]:
# collect out-of-fold predictions; form k-fold CV
def get_out_of_fold_predictions(X, y, models, n_splits=10, random_state=None):
    """Collect out-of-fold predictions from every base model via k-fold CV.

    For each fold, every model is fitted on that fold's training rows and
    then predicts the held-out rows. The held-out predictions (one column
    per model) and the matching targets are accumulated fold by fold.

    Parameters
    ----------
    X, y : arrays that support indexing with an integer index array
        Training inputs and targets.
    models : list of estimators exposing ``fit`` and ``predict``
        Fitted in place; it is iterated once per fold, so it must be
        re-iterable. After the call each model holds the fit from the last
        fold only.
    n_splits : int, default=10
        Number of folds.
    random_state : int, RandomState instance or None, default=None
        Seed for the shuffled fold assignment. ``None`` keeps the split
        unseeded, as it was before this parameter existed.

    Returns
    -------
    meta_X : ndarray
        Stacked out-of-fold predictions, one row per sample and one column
        per model.
    meta_y : ndarray
        Targets in the same (fold-by-fold) row order as ``meta_X``.
    """
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    meta_X, meta_y = [], []

    for train_ix, test_ix in kfold.split(X):
        train_X, test_X = X[train_ix], X[test_ix]
        train_y, test_y = y[train_ix], y[test_ix]

        # Targets are appended in fold order so they stay aligned with the
        # rows of predictions stacked below.
        meta_y.extend(test_y)

        fold_yhats = []
        for model in models:
            model.fit(train_X, train_y)
            yhat = model.predict(test_X)
            # Each model contributes one column of held-out predictions.
            fold_yhats.append(yhat.reshape(len(yhat), 1))

        meta_X.append(hstack(fold_yhats))

    return vstack(meta_X), asarray(meta_y)

In [4]:
# fit all base models on training dataset
def fit_base_models(X, y, models):
    """Fit each base model, in place, on the supplied inputs and targets.

    Parameters
    ----------
    X, y : training inputs and targets, passed unchanged to every ``fit``.
    models : iterable of objects exposing ``fit(X, y)``.

    Returns
    -------
    None
        The models themselves are updated by their own ``fit`` calls.
    """
    for estimator in models:
        estimator.fit(X, y)

**Note: the meta model selected here is a linear regression.**

In [5]:
# fit a meta model
def fit_meta_model(X, y):
    """Fit a linear-regression meta model and return it.

    Parameters
    ----------
    X : inputs for the meta model.
    y : targets matching the rows of ``X``.

    Returns
    -------
    LinearRegression
        The fitted estimator.
    """
    meta_learner = LinearRegression()
    # fit() hands back the estimator itself, so it can be returned directly.
    return meta_learner.fit(X, y)

In [6]:
# evaluate a list of models on a dataset
def evaluate_models(X, y, models):
    """Print the RMSE of each already-fitted model on the given dataset.

    Parameters
    ----------
    X, y : inputs and true targets to score against.
    models : iterable of fitted objects exposing ``predict(X)``.

    One line is printed per model, labelled with the model's class name.
    Nothing is returned.
    """
    for fitted in models:
        rmse = sqrt(mean_squared_error(y, fitted.predict(X)))
        label = fitted.__class__.__name__
        print('%s: RMSE %.3f' % (label, rmse))

In [7]:
# make predictions with stacked model
def super_learner_predictions(X, models, meta_model):
    """Predict with the stacked ensemble.

    Every base model predicts on ``X``; those predictions are laid side by
    side (one column per base model) and passed to the meta model, whose
    output is returned.

    Parameters
    ----------
    X : inputs forwarded to each base model's ``predict``.
    models : iterable of fitted base models.
    meta_model : fitted model whose ``predict`` takes the stacked columns.
    """
    def as_column(pred):
        # Turn a model's predictions into a single column.
        return pred.reshape(len(pred), 1)

    stacked = hstack([as_column(base.predict(X)) for base in models])
    return meta_model.predict(stacked)

### Run

In [8]:
# Fixed seed so that the synthetic dataset and the train/held-out split are
# the same on every fresh run of the notebook.
RANDOM_SEED = 42

# create inputs and outputs (1000 samples, each a vector of 100 features)
X_full, y_full = make_regression(
    n_samples=1000, n_features=100, noise=0.5, random_state=RANDOM_SEED
)

# split 50/50 into training data (X, y) and held-out data (X_val, y_val);
# the unsplit arrays stay available as X_full / y_full
X, X_val, y, y_val = train_test_split(
    X_full, y_full, test_size=0.50, random_state=RANDOM_SEED
)
print('Train', X.shape, y.shape, 'Test', X_val.shape, y_val.shape)

Train (500, 100) (500,) Test (500, 100) (500,)


In [9]:
# build the pool of base models
models = get_models()

# get out-of-fold predictions on the training split; these (one column per
# base model) are the inputs the meta model is trained on, with meta_y as
# the matching targets
meta_X, meta_y = get_out_of_fold_predictions(X, y, models)
print('Meta ', meta_X.shape, meta_y.shape)

# refit the base models on the whole training split; the cross-validation
# loop above fits them in place, so until now each one only holds the fit
# from its last fold
fit_base_models(X, y, models)

# fit the meta model on the out-of-fold predictions
meta_model = fit_meta_model(meta_X, meta_y)

# report each base model's RMSE on the held-out split
evaluate_models(X_val, y_val, models)

Meta  (500, 8) (500,)
ElasticNet: RMSE 66.031
SVR: RMSE 171.508
DecisionTreeRegressor: RMSE 174.009
KNeighborsRegressor: RMSE 155.923
AdaBoostRegressor: RMSE 91.943
BaggingRegressor: RMSE 110.207
RandomForestRegressor: RMSE 109.929
ExtraTreesRegressor: RMSE 100.907


In [10]:
# score the stacked ensemble (base models feeding the meta model) on the held-out split
yhat = super_learner_predictions(X_val, models, meta_model)
super_learner_rmse = sqrt(mean_squared_error(y_val, yhat))
print('Super Learner: RMSE %.3f' % super_learner_rmse)

Super Learner: RMSE 27.385


The aim is to bring the RMSE as close to 0 as possible (while staying aware of the bias-variance tradeoff).

### Other

In [11]:
# The Super Learner maps X_val to predictions (yhat); below we compare the predicted yhat against the actual y_val
import pandas as pd

In [12]:
# One row per held-out sample, labelled from 1. The row count is taken from
# the predictions rather than hard-coded, so this cell keeps working if the
# size of the held-out split changes.
df = pd.DataFrame(yhat, index=range(1, len(yhat) + 1), columns=["yhat"])
df["y_val"] = y_val
# residual: Super Learner prediction minus the actual target
df["diff"] = df["yhat"] - df["y_val"]

In [13]:
# Summary statistics for the residuals, i.e. the Super Learner prediction (yhat)
# minus the actual value (y_val) for each held-out sample.
df["diff"].describe()

count    500.000000
mean      -0.184540
std       27.411683
min      -88.770952
25%      -19.031679
50%       -0.143634
75%       17.303063
max       83.706315
Name: diff, dtype: float64