# Stacking Models

<table align="left"><td>
  <a target="_blank"  href="https://colab.research.google.com/github/TannerGilbert/Tutorials/blob/master/A%20guide%20to%20Ensemble%C2%A0Learning/A%20guide%20to%20Ensemble%20Learning.ipynb">
    <img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab
  </a>
</td><td>
  <a target="_blank"  href="https://github.com/TannerGilbert/Tutorials/blob/master/A%20guide%20to%20Ensemble%C2%A0Learning/A%20guide%20to%20Ensemble%20Learning.ipynb">
    <img width=32px src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a>
</td></table>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pathlib
from scipy import stats
from scipy.stats import norm, skew

from sklearn.linear_model import Lasso
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.tree import DecisionTreeRegressor

import xgboost as xgb
import lightgbm as lgb

import warnings
warnings.filterwarnings("ignore")

## Load in data-sets

In [None]:
# Install Kaggle API
!pip install -q kaggle
!pip install -q kaggle-cli

[K     |████████████████████████████████| 81kB 3.2MB/s 
[K     |████████████████████████████████| 4.2MB 4.7MB/s 
[K     |████████████████████████████████| 112kB 39.3MB/s 
[K     |████████████████████████████████| 143kB 34.4MB/s 
[K     |████████████████████████████████| 51kB 4.6MB/s 
[?25h  Building wheel for kaggle-cli (setup.py) ... [?25l[?25hdone
  Building wheel for lxml (setup.py) ... [?25lerror
[31m  ERROR: Failed building wheel for lxml[0m
[?25h  Building wheel for PrettyTable (setup.py) ... [?25l[?25hdone
  Building wheel for pyperclip (setup.py) ... [?25l[?25hdone
    Running setup.py install for lxml ... [?25l[?25herror
[31mERROR: Command errored out with exit status 1: /usr/bin/python3 -u -c 'import sys, setuptools, tokenize; sys.argv[0] = '"'"'/tmp/pip-install-yi8bf154/lxml/setup.py'"'"'; __file__='"'"'/tmp/pip-install-yi8bf154/lxml/setup.py'"'"';f=getattr(tokenize, '"'"'open'"'"', open)(__file__);code=f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.clos

In [None]:
# only for google colab
import os
os.environ['KAGGLE_USERNAME'] = "<username>" 
os.environ['KAGGLE_KEY'] = "<key>"

To download the data from the competition you need to sign the rules first.

In [None]:
!kaggle competitions download -c house-prices-advanced-regression-techniques

Downloading train.csv to /content
  0% 0.00/450k [00:00<?, ?B/s]
100% 450k/450k [00:00<00:00, 55.9MB/s]
Downloading data_description.txt to /content
  0% 0.00/13.1k [00:00<?, ?B/s]
100% 13.1k/13.1k [00:00<00:00, 12.7MB/s]
Downloading test.csv to /content
  0% 0.00/441k [00:00<?, ?B/s]
100% 441k/441k [00:00<00:00, 63.3MB/s]
Downloading sample_submission.csv to /content
  0% 0.00/31.2k [00:00<?, ?B/s]
100% 31.2k/31.2k [00:00<00:00, 42.6MB/s]


In [None]:
path = pathlib.Path('/content')

In [None]:
# Load in the train and test dataset
train = pd.read_csv(path/'train.csv')
test = pd.read_csv(path/'test.csv')

train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,...,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,...,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,...,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,...,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,...,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,...,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000


In [None]:
train.isnull().sum().sort_values(ascending=False)

PoolQC           1453
MiscFeature      1406
Alley            1369
Fence            1179
FireplaceQu       690
                 ... 
CentralAir          0
SaleCondition       0
Heating             0
TotalBsmtSF         0
Id                  0
Length: 81, dtype: int64

In [None]:
# Save the 'Id' column
train_ID = train['Id']
test_ID = test['Id']

# Drop id column because is is unnecessary for making predictions
train.drop('Id', axis=1, inplace=True)
test.drop('Id', axis=1, inplace=True)

## Feature Engineering
Based on https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard

In [None]:
train['SalePrice'] = np.log1p(train['SalePrice'])

In [None]:
ntrain = train.shape[0]
ntest = test.shape[0]
y_train = train.SalePrice.values
all_data = pd.concat((train, test)).reset_index(drop=True)
all_data.drop(['SalePrice'], axis=1, inplace=True)
print('all_data size is: {}'.format(all_data.shape))

all_data size is: (2919, 79)


### Missing data

In [None]:
all_data_na = (all_data.isnull().sum() / len(all_data))*100
all_data_na = all_data_na.drop(all_data_na[all_data_na==0].index).sort_values(ascending=False)[:30]
missing_data = pd.DataFrame({'Missing Ration': all_data_na})
missing_data.head(20)

Unnamed: 0,Missing Ration
PoolQC,99.657417
MiscFeature,96.402878
Alley,93.216855
Fence,80.438506
FireplaceQu,48.646797
LotFrontage,16.649538
GarageFinish,5.447071
GarageYrBlt,5.447071
GarageQual,5.447071
GarageCond,5.447071


### Imputing missing values

In [None]:
all_data['PoolQC'] = all_data['PoolQC'].fillna('None')
all_data['MiscFeature'] = all_data['MiscFeature'].fillna('None')
all_data['Alley'] = all_data['Alley'].fillna('None')
all_data['Fence'] = all_data['Fence'].fillna('None')
all_data['FireplaceQu'] = all_data['FireplaceQu'].fillna('None')
all_data['LotFrontage'] = all_data.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))
for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'):
    all_data[col] = all_data[col].fillna('None')
for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
    all_data[col] = all_data[col].fillna(0)
for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath'):
    all_data[col] = all_data[col].fillna(0)
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    all_data[col] = all_data[col].fillna('None')
all_data['MasVnrType'] = all_data['MasVnrType'].fillna('None')
all_data['MasVnrArea'] = all_data['MasVnrArea'].fillna(0)
all_data['MSZoning'] = all_data['MSZoning'].fillna(all_data['MSZoning'].mode()[0])
all_data = all_data.drop(['Utilities'], axis=1)
all_data['Functional'] = all_data['Functional'].fillna('Typ')
all_data['Electrical'] = all_data['Electrical'].fillna(all_data['Electrical'].mode()[0])
all_data['KitchenQual'] = all_data['KitchenQual'].fillna(all_data['KitchenQual'].mode()[0])
all_data['Exterior1st'] = all_data['Exterior1st'].fillna(all_data['Exterior1st'].mode()[0])
all_data['Exterior2nd'] = all_data['Exterior2nd'].fillna(all_data['Exterior2nd'].mode()[0])
all_data['SaleType'] = all_data['SaleType'].fillna(all_data['SaleType'].mode()[0])
all_data['MSSubClass'] = all_data['MSSubClass'].fillna('None')

In [None]:
all_data_na = (all_data.isnull().sum() / len(all_data))*100
all_data_na = all_data_na.drop(all_data_na[all_data_na==0].index).sort_values(ascending=False)
missing_data = pd.DataFrame({'Missing Ratio':all_data_na})
missing_data.head()

Unnamed: 0,Missing Ratio


In [None]:
train = all_data[:ntrain]
test = all_data[ntrain:]
train.to_csv(path/'train_cleaned.csv')
test.to_csv(path/'test_cleaned.csv')

### Transforming some numerical features to categorical

In [None]:
all_data['MSSubClass'] = all_data['MSSubClass'].apply(str)

all_data['OverallCond'] = all_data['OverallCond'].astype(str)

all_data['YrSold'] = all_data['YrSold'].astype(str)
all_data['MoSold'] = all_data['MoSold'].astype(str)

In [None]:
from sklearn.preprocessing import LabelEncoder
cols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 
        'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 
        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
        'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 
        'YrSold', 'MoSold')

for c in cols:
    lbl = LabelEncoder()
    lbl.fit(list(all_data[c].values))
    all_data[c] = lbl.transform(list(all_data[c].values))
    
print('Shape all_data {}'.format(all_data.shape))

Shape all_data (2919, 78)


In [None]:
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']

In [None]:
numeric_feats = all_data.dtypes[all_data.dtypes!='object'].index

# Check the skew of all the numeric features
skewed_feats = all_data[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
print('Skew in numeric features:')
skewness = pd.DataFrame({'Skew': skewed_feats})
skewness.head(10)

Skew in numeric features:


Unnamed: 0,Skew
MiscVal,21.947195
PoolArea,16.898328
LotArea,12.822431
LowQualFinSF,12.088761
3SsnPorch,11.376065
LandSlope,4.975157
KitchenAbvGr,4.302254
BsmtFinSF2,4.146143
EnclosedPorch,4.003891
ScreenPorch,3.946694


In [None]:
skewness = skewness[abs(skewness) > 0.75]
print('There are {} skewed numerical features to Box Cox transform'.format(skewness.shape[0]))

from scipy.special import boxcox1p
skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    all_data[feat] = boxcox1p(all_data[feat], lam)

There are 59 skewed numerical features to Box Cox transform


In [None]:
print(all_data.shape)
all_data = pd.get_dummies(all_data)
print(all_data.shape)

(2919, 79)
(2919, 221)


In [None]:
train = all_data[:ntrain]
test = all_data[ntrain:]

In [None]:
train.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,Street,Alley,LotShape,LandSlope,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,HeatingQC,CentralAir,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,...,Foundation_Stone,Foundation_Wood,Heating_Floor,Heating_GasA,Heating_GasW,Heating_Grav,Heating_OthW,Heating_Wall,Electrical_FuseA,Electrical_FuseF,Electrical_FuseP,Electrical_Mix,Electrical_SBrkr,GarageType_2Types,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageType_None,MiscFeature_Gar2,MiscFeature_None,MiscFeature_Othr,MiscFeature_Shed,MiscFeature_TenC,SaleType_COD,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,2.885846,5.831328,19.212182,0.730463,0.730463,1.540963,0.0,2.440268,1.820334,14.187527,14.187527,8.059126,1.194318,1.820334,1.194318,1.820334,1.540963,1.194318,11.170327,2.259674,0.0,7.483296,11.692623,0.0,0.730463,11.692623,11.686189,0.0,13.698888,0.730463,0.0,1.194318,0.730463,1.540963,0.730463,1.194318,2.602594,2.259674,0.0,1.540963,...,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
1,2.055642,6.221214,19.712205,0.730463,0.730463,1.540963,0.0,2.259674,2.440268,14.145138,14.145138,0.0,1.540963,1.820334,1.194318,1.820334,0.730463,0.0,12.062832,2.259674,0.0,8.897844,12.792276,0.0,0.730463,12.792276,0.0,0.0,12.792276,0.0,0.730463,1.194318,0.0,1.540963,0.730463,1.540963,2.259674,2.259674,0.730463,2.055642,...,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
2,2.885846,5.91494,20.347241,0.730463,0.730463,0.0,0.0,2.440268,1.820334,14.184404,14.185966,7.646538,1.194318,1.820334,1.194318,1.820334,1.194318,1.194318,10.200343,2.259674,0.0,9.91706,11.892039,0.0,0.730463,11.892039,11.724598,0.0,13.832085,0.730463,0.0,1.194318,0.730463,1.540963,0.730463,1.194318,2.259674,2.259674,0.730463,2.055642,...,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
3,3.01134,5.684507,19.691553,0.730463,0.730463,0.0,0.0,2.440268,1.820334,14.047529,14.135652,0.0,1.540963,1.820334,1.820334,0.730463,1.540963,0.0,8.274266,2.259674,0.0,10.4685,11.354094,1.194318,0.730463,12.013683,11.354094,0.0,13.711364,0.730463,0.0,0.730463,0.0,1.540963,0.730463,1.194318,2.440268,2.259674,0.730463,1.194318,...,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0
4,2.885846,6.314735,21.32516,0.730463,0.730463,0.0,0.0,2.602594,1.820334,14.182841,14.182841,9.391827,1.194318,1.820334,1.194318,1.820334,0.0,1.194318,10.971129,2.259674,0.0,10.221051,12.510588,0.0,0.730463,12.510588,12.271365,0.0,14.480029,0.730463,0.0,1.194318,0.730463,1.820334,0.730463,1.194318,2.75025,2.259674,0.730463,2.055642,...,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0


In [None]:
train.dtypes.unique()

array([dtype('float64'), dtype('uint8')], dtype=object)

In [None]:
y_train

array([12.24769912, 12.10901644, 12.31717117, ..., 12.49313327,
       11.86446927, 11.90159023])

In [None]:
X = train.values

## Accuracy function

In [None]:
from sklearn.model_selection import KFold, cross_val_score

n_folds = 5

def get_cv_scores(model, X, y, print_scores=True):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(X)
    rmse = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv = kf))
    if print_scores:
        print(f'Root mean squared error: {rmse.mean():.3f} ({rmse.std():.3f})')
    return [rmse]

## Base Models

### Lasso

In [None]:
lasso_model = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=1))

In [None]:
%%time
get_cv_scores(lasso_model, X, y_train)

Root mean squared error: 0.124 (0.016)
CPU times: user 893 ms, sys: 717 ms, total: 1.61 s
Wall time: 981 ms


[array([0.10361373, 0.13480373, 0.12784782, 0.10698808, 0.14677305])]

In [None]:
lasso_model.fit(X, y_train)
lasso_model.predict([X[0]])

array([12.23426999])

In [None]:
y_train[0]

12.24769911637256

In [None]:
lasso_model.score(X, y_train)

0.9285263568332836

### DecisionTreeRegresor

In [None]:
%%time
dt = DecisionTreeRegressor()
get_cv_scores(dt, X, y_train);

Root mean squared error: 0.212 (0.023)
CPU times: user 273 ms, sys: 84.4 ms, total: 358 ms
Wall time: 288 ms


### RandomForestRegressor

In [None]:
rf = RandomForestRegressor()

In [None]:
%%time
get_cv_scores(rf, X, y_train);

Root mean squared error: 0.143 (0.007)
CPU times: user 13.2 s, sys: 31.8 ms, total: 13.2 s
Wall time: 13.2 s


[array([0.13682295, 0.15465907, 0.14514017, 0.1336972 , 0.14443453])]

### GradientBoostingRegressor

In [None]:
gbr = GradientBoostingRegressor()

In [None]:
%%time
get_cv_scores(gbr, X, y_train);

Root mean squared error: 0.125 (0.008)
CPU times: user 4.97 s, sys: 4.8 ms, total: 4.97 s
Wall time: 4.97 s


[array([0.11479306, 0.13346123, 0.13440176, 0.11796313, 0.12488726])]

In [None]:
gbr = GradientBoostingRegressor()
gbr.fit(X, y_train)
gbr.score(X, y_train)

0.9581780008371052

### XGBoost

In [None]:
xgb_model = xgb.XGBRegressor(objective ='reg:squarederror')

In [None]:
%%time
get_cv_scores(xgb_model, X, y_train);

Root mean squared error: 0.130 (0.007)
CPU times: user 3.18 s, sys: 34.9 ms, total: 3.21 s
Wall time: 3.33 s


[array([0.11982543, 0.14036859, 0.1326639 , 0.125978  , 0.13039583])]

### LightGBM

In [None]:
lgb_model = lgb.LGBMRegressor()

In [None]:
%%time
get_cv_scores(lgb_model, X, y_train);

Root mean squared error: 0.133 (0.009)
CPU times: user 1.75 s, sys: 48.8 ms, total: 1.8 s
Wall time: 1.83 s


[array([0.12484455, 0.14604628, 0.13847223, 0.12293996, 0.13329926])]

## Stacking Models

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin, clone, RegressorMixin

In [None]:
# based on https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard
# and https://www.kaggle.com/eikedehling/trying-out-stacking-approaches
class AveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
        
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)

        return self
    
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)

In [None]:
class WeightedAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models, weights):
        self.models = models
        self.weights = weights
        assert sum(self.weights)==1
        
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)

        return self
    
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.sum(predictions*self.weights, axis=1)

In [None]:
class BaggingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
        
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]

        for model in self.models_:
            X_tmp, y_tmp = self.subsample(X, y)
            model.fit(X_tmp, y_tmp)
        
        return self
            
    # Create a random subsample from the dataset with replacement
    def subsample(self, X, y, ratio=1.0):
        X_new, y_new = list(), list()
        n_sample = round(len(X) * ratio)
        while len(X_new) < n_sample:
            index = np.random.randint(len(X))
            X_new.append(X[index])
            y_new.append(y[index])
        return X_new, y_new
    
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)

In [None]:
#class BoostingModels(BaseEstimator, RegressorMixin, TransformerMixin):
#    def __init__(self, base_models)

In [None]:
# based on https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard
# and https://www.kaggle.com/eikedehling/trying-out-stacking-approaches
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds=5, use_features_in_secondary=False):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
        self.use_features_in_secondary = use_features_in_secondary
        
    def fit(self, X, y):
        """Fit all the models on the given dataset"""
        self.base_models_ = [list() for x in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=42)
        
        # Train cloned base models and create out-of-fold predictions
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict(X[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred
        
        if self.use_features_in_secondary:
            self.meta_model_.fit(np.hstack((X, out_of_fold_predictions)), y)
        else:
            self.meta_model_.fit(out_of_fold_predictions, y)
            
        return self
    
    def predict(self, X):
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
            for base_models in self.base_models_ ])
        if self.use_features_in_secondary:
            return self.meta_model_.predict(np.hstack((X, meta_features)))
        else:
            return self.meta_model_.predict(meta_features)
    
    def predict_proba(self, X):
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
            for base_models in self.base_models_ ])
        if self.use_features_in_secondary:
            return self.meta_model_.predict_proba(np.hstack((X, meta_features)))
        else:
            return self.meta_model_.predict_proba(meta_features)

In [None]:
# based on https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard
# and https://www.kaggle.com/eikedehling/trying-out-stacking-approaches
class StackingModelsRetrained(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds=5, use_features_in_secondary=False):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
        self.use_features_in_secondary = use_features_in_secondary
        
    def fit(self, X, y):
        """Fit all the models on the given dataset"""
        self.base_models_ = [clone(x) for x in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=42)
        
        # Train cloned base models and create out-of-fold predictions
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(model)
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict(X[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred
        
        if self.use_features_in_secondary:
            self.meta_model_.fit(np.hstack((X, out_of_fold_predictions)), y)
        else:
            self.meta_model_.fit(out_of_fold_predictions, y)
            
        for model in self.base_models_:
            model.fit(X, y)
            
        return self
    
    def predict(self, X):
        meta_features = np.column_stack([
            base_model.predict(X) for base_model in self.base_models_ ])
        if self.use_features_in_secondary:
            return self.meta_model_.predict(np.hstack((X, meta_features)))
        else:
            return self.meta_model_.predict(meta_features)
    
    def predict_proba(self, X):
        meta_features = np.column_stack([
            base_model.predict(X) for base_model in self.base_models_ ])
        if self.use_features_in_secondary:
            return self.meta_model_.predict_proba(np.hstack((X, meta_features)))
        else:
            return self.meta_model_.predict_proba(meta_features)

In [None]:
from sklearn.model_selection import train_test_split

class BlendingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, holdout_pct=0.2, use_features_in_secondary=False):
        self.base_models = base_models
        self.meta_model = meta_model
        self.holdout_pct = holdout_pct
        self.use_features_in_secondary = use_features_in_secondary
        
    def fit(self, X, y):
        """Fit all the models on the given dataset"""
        self.base_models_ = [clone(x) for x in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        
        X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=self.holdout_pct)
                
        holdout_predictions = np.zeros((X_holdout.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models_):
            model.fit(X_train, y_train)
            y_pred = model.predict(X_holdout)
            holdout_predictions[:, i] = y_pred
        if self.use_features_in_secondary:
            self.meta_model_.fit(np.hstack((X_holdout, holdout_predictions)), y_holdout)
        else:
            self.meta_model_.fit(holdout_predictions, y_holdout)
            
        return self
    
    def predict(self, X):
        meta_features = np.column_stack([
            model.predict(X) for model in self.base_models_
        ])
        if self.use_features_in_secondary:
            return self.meta_model_.predict(np.hstack((X, meta_features)))
        else:
            return self.meta_model_.predict(meta_features)

## Trying Stacking with different Sub- and Aggregating Models

In [None]:
%%time
averaged_model1 = AveragedModels([gbr, lasso_model, xgb_model])
get_cv_scores(averaged_model1, X, y_train);

Root mean squared error: 0.119 (0.009)
CPU times: user 9.13 s, sys: 1.15 s, total: 10.3 s
Wall time: 9.01 s


In [None]:
%%time
averaged_model1 = AveragedModels([gbr, lasso_model, xgb_model, lgb_model])
get_cv_scores(averaged_model1, X, y_train);

Root mean squared error: 0.120 (0.009)
CPU times: user 10.8 s, sys: 1.2 s, total: 12 s
Wall time: 10.6 s


In [None]:
%%time
weighted_average_model = WeightedAveragedModels([gbr, lasso_model, xgb_model], [0.3, 0.3, 0.4])
get_cv_scores(weighted_average_model, X, y_train);

Root mean squared error: 0.120 (0.009)
CPU times: user 9.12 s, sys: 1.18 s, total: 10.3 s
Wall time: 8.94 s


In [None]:
%%time
weighted_average_model = WeightedAveragedModels([gbr, lasso_model, xgb_model], [0.3, 0.45, 0.25])
get_cv_scores(weighted_average_model, X, y_train);

Root mean squared error: 0.118 (0.010)
CPU times: user 8.99 s, sys: 1.21 s, total: 10.2 s
Wall time: 8.89 s


In [None]:
%%time
bagging_model = BaggingModels([gbr, lasso_model, xgb_model])
get_cv_scores(bagging_model, X, y_train);

Root mean squared error: 0.153 (0.015)
CPU times: user 7.98 s, sys: 1.35 s, total: 9.34 s
Wall time: 7.76 s


In [None]:
%%time
bagging_model = BaggingModels([gbr, lasso_model, xgb_model]*2)
get_cv_scores(bagging_model, X, y_train);

Root mean squared error: 0.145 (0.014)
CPU times: user 15.4 s, sys: 2.1 s, total: 17.5 s
Wall time: 15 s


In [None]:
%%time
bagging_model = BaggingModels([gbr, lasso_model, xgb_model]*3)
get_cv_scores(bagging_model, X, y_train);

Root mean squared error: 0.142 (0.005)
CPU times: user 23.2 s, sys: 2.89 s, total: 26.1 s
Wall time: 22.7 s


In [None]:
%%time
stacking_model1 = StackingAveragedModels([gbr, lgb_model, xgb_model], lasso_model)
get_cv_scores(stacking_model1, X, y_train);

Root mean squared error: 0.125 (0.008)
CPU times: user 39.5 s, sys: 314 ms, total: 39.8 s
Wall time: 39.7 s


In [None]:
%%time
stacking_model2= StackingAveragedModels([gbr, lgb_model, xgb_model], lasso_model, use_features_in_secondary=True)
get_cv_scores(stacking_model2, X, y_train);

Root mean squared error: 0.120 (0.014)
CPU times: user 40.4 s, sys: 1.23 s, total: 41.7 s
Wall time: 40.5 s


In [None]:
%%time
stacking_model3 = StackingAveragedModels([gbr, lgb_model, xgb_model, lasso_model], lasso_model)
get_cv_scores(stacking_model3, X, y_train);

Root mean squared error: 0.119 (0.014)
CPU times: user 43.9 s, sys: 4.24 s, total: 48.1 s
Wall time: 43.8 s


In [None]:
%%time
stacking_model4 = StackingAveragedModels([gbr, lgb_model, xgb_model, lasso_model], lasso_model, use_features_in_secondary=True)
get_cv_scores(stacking_model4, X, y_train);

Root mean squared error: 0.121 (0.014)
CPU times: user 45 s, sys: 4.97 s, total: 50 s
Wall time: 44.9 s


In [None]:
%%time
blending_model1 = BlendingAveragedModels([gbr, lgb_model, xgb_model, lasso_model], lasso_model)
get_cv_scores(blending_model1, X, y_train);

Root mean squared error: 0.124 (0.015)
CPU times: user 8.97 s, sys: 902 ms, total: 9.87 s
Wall time: 8.91 s


In [None]:
%%time
blending_model2 = BlendingAveragedModels([gbr, lgb_model, xgb_model, lasso_model], lasso_model, use_features_in_secondary=True)
get_cv_scores(blending_model2, X, y_train);

Root mean squared error: 0.141 (0.003)
CPU times: user 9.36 s, sys: 1.24 s, total: 10.6 s
Wall time: 9.28 s


In [None]:
%%time
stacking_retrained_model1 = StackingModelsRetrained([gbr, lgb_model, xgb_model], lasso_model)
get_cv_scores(stacking_retrained_model1, X, y_train);

Root mean squared error: 0.125 (0.008)
CPU times: user 49.1 s, sys: 412 ms, total: 49.6 s
Wall time: 49.5 s


In [None]:
%%time
stacking_retrained_model2 = StackingModelsRetrained([gbr, lgb_model, xgb_model], lasso_model, use_features_in_secondary=True)
get_cv_scores(stacking_retrained_model2, X, y_train);

Root mean squared error: 0.121 (0.014)
CPU times: user 50 s, sys: 1.53 s, total: 51.5 s
Wall time: 50 s


In [None]:
%%time
stacking_retrained_model3 = StackingModelsRetrained([gbr, lgb_model, xgb_model, lasso_model], lasso_model)
get_cv_scores(stacking_retrained_model1, X, y_train);

Root mean squared error: 0.125 (0.008)
CPU times: user 48.8 s, sys: 373 ms, total: 49.2 s
Wall time: 49 s


In [None]:
%%time
stacking_retrained_model4 = StackingModelsRetrained([gbr, lgb_model, xgb_model, lasso_model], lasso_model, use_features_in_secondary=True)
get_cv_scores(stacking_retrained_model2, X, y_train);

Root mean squared error: 0.121 (0.014)
CPU times: user 50.2 s, sys: 1.55 s, total: 51.8 s
Wall time: 50.3 s
