# Stacking

Here we consider two types of stacking ensemble:
- one based on seven different model types
- one based on variants of a single model type (SVC)

Overall the options compared are:
- heterogeneous ensemble with 7 estimators, each a different model type
- the same 7 heterogeneous estimators, but combined by stacking
- single estimator as baseline
- stacked ensemble of 7 variations on a single model type
- same stacked ensemble but 20 estimators

The heterogeneous ensemble uses `VotingClassifier`.  
The stacked ensembles use `StackingClassifier`; both classes come from `scikit-learn`.  
`StackingClassifier` uses logistic regression as the final estimator. 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from statistics import mode, mean
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import StackingClassifier, VotingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, RepeatedKFold
from ensemble_functions import *
from random import choice

### Load Data

In [None]:
# Read the hotel-review data set; the CSV is expected in the working directory.
hotel_pd = pd.read_csv('HotelRevHelpfulness.csv')
# Preview the first rows (shown as the cell output).
hotel_pd.head()

In [None]:
# Remove 'hotelId' from the frame so it does not end up as a feature column
# (presumably it is just an identifier -- confirm against the data set).
# pop() mutates hotel_pd in place; the returned column is not needed.
hotel_pd.pop('hotelId')
# Pop the target column off the frame and keep it as a NumPy array.
y = hotel_pd.pop('reviewHelpfulness').values
# Every column still left in the frame is used as a feature.
X = hotel_pd.values
# Show (n_samples, n_features) as the cell output.
X.shape

## Models Used
These are the models used in the heterogeneous ensemble and in stacking.

In [None]:
# Kept so that code elsewhere that refers to `scaler` continues to work.
scaler = StandardScaler()

# Each pipeline gets its own StandardScaler instance. Sharing one scaler
# object between pipelines means that fitting any pipeline directly would
# overwrite the scaling statistics seen by all the others. (Meta-estimators
# and cross_val_score clone their inputs, so this matters for direct fits.)
ann = make_pipeline(StandardScaler(), MLPClassifier(solver='lbfgs'))
lr = make_pipeline(StandardScaler(), LogisticRegression())
kNN = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3))
# The decision tree is used on the raw (unscaled) features.
dtree = DecisionTreeClassifier(criterion='entropy')
gnb = make_pipeline(StandardScaler(), GaussianNB())
svc = make_pipeline(StandardScaler(), SVC())
qda = make_pipeline(StandardScaler(), QuadraticDiscriminantAnalysis())

# (name, estimator) tuples: the format required by sklearn's
# StackingClassifier and VotingClassifier.
estimator_tups = [('kNN', kNN), ('dtree', dtree), ('gnb', gnb),
                  ('ann', ann), ('lr', lr), ('SVC', svc), ('QDA', qda)]

## SVC Stacking
We evaluate stacked ensembles based on SVC variants.  
The variants are generated by randomly selecting hyper-parameters. 

In [None]:
def select_param_dict(pd_options):
    """Draw one random setting for every hyper-parameter.

    pd_options maps each parameter name to a sequence of candidate values.
    The returned dict has the same keys, in the same order, each mapped to
    one candidate picked with random.choice.
    """
    return {name: choice(candidates)
            for name, candidates in pd_options.items()}

In [None]:
# Candidate values for each SVC hyper-parameter; one value per parameter is
# drawn at random whenever an SVC variant is generated.
SVC_params = dict(
    kernel=["rbf", "linear", "poly"],
    C=[0.05, 0.1, 0.2],
    gamma=[0.1, 0.5],
)
SVC_params

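Here we set up three stacked ensembles (SVC variants with 7 and with 20 estimators, and the heterogeneous stack), together with the voting ensemble and the single-SVC baseline.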

In [None]:
# n_jobs = -1 is deliberately not passed to StackingClassifier: it did not seem
# to work when the stack is itself evaluated with cross-validation
# (too many levels of x-val?).
n_est = 20

# n_est SVC pipelines, each with independently drawn hyper-parameters and its
# own scaler, as (name, estimator) tuples.
SVC_ests = [
    ('Est' + str(i),
     make_pipeline(StandardScaler(), SVC(**select_param_dict(SVC_params))))
    for i in range(n_est)
]


def _logistic_stack(estimators):
    """Stack `estimators` under a logistic-regression final estimator.

    The final estimator is trained on the base estimators' predictions
    obtained with 10-fold internal cross-validation; the original features
    are not passed through to it.
    """
    return StackingClassifier(estimators=estimators, cv=10,
                              passthrough=False,
                              final_estimator=LogisticRegression(max_iter=5000))


# Stack of the first 7 SVC variants, and stack of all n_est variants.
SVC_stack7 = _logistic_stack(SVC_ests[:7])
SVC_stack20 = _logistic_stack(SVC_ests)

# Stack built from the 7 different model types.
hetero_stack = _logistic_stack(estimator_tups)

# Heterogeneous ensemble over the same 7 model types, combined by voting
# (VotingClassifier defaults are used).
voting = VotingClassifier(estimators=estimator_tups)

# Single-SVC baseline: one of the generated variants, picked at random.
# Each entry is a (name, pipeline) tuple, so [1] selects the pipeline.
svc = choice(SVC_ests)[1]

In [None]:
# Draw the single-SVC baseline from the generated variants. Each entry of
# SVC_ests is a (name, pipeline) tuple, so [1] selects the pipeline.
# Re-running this cell draws again and may select a different variant.
svc = choice(SVC_ests)[1]
# Display the chosen pipeline as the cell output.
svc

## X-Val testing
The actual testing uses 10-fold cross-validation repeated 20 times.

In [None]:
# Display name -> model to evaluate. Insertion order is the order in which
# the models are evaluated and, later, plotted.
models_dict = {
    'Hetero': voting,
    'Hetero stack': hetero_stack,
    'SVC': svc,
    'SVCstack 7': SVC_stack7,
    'SVCstack 20': SVC_stack20,
}
# Filled in by the cross-validation loop: display name -> mean accuracy.
scores_dict = {}

In [None]:
# 10-fold cross-validation repeated 20 times, i.e. 200 fits per model.
# NOTE: no random_state is given, so the splits differ from run to run.
rkf = RepeatedKFold(n_splits=10, n_repeats=20)
for m, model in models_dict.items():
    # One score per fold, computed in parallel on all available cores.
    score = cross_val_score(model, X, y, cv=rkf, n_jobs=-1)
    avg = mean(score)
    scores_dict[m] = avg
    print(' %12s % 4.3f' % (m, avg))

In [None]:
def simple_barchart(names, values, colours, title = ' ',
                    y_lab='', x_lab = '' , ymax = 1, ymin = 0.5):
    """Draw a bar chart with one bar per name, show it, and return the figure.

    Parameters
    ----------
    names : iterable of str
        Tick labels, one per bar.
    values : iterable of float
        Bar heights, in the same order as `names`.
    colours : colour or sequence of colours
        Bar colour(s), passed straight to matplotlib's `bar`.
    title, y_lab, x_lab : str
        Chart title and axis labels.
    ymax : float
        Upper limit of the y axis.
    ymin : float
        Lower limit of the y axis. Defaults to 0.5, the value that was
        previously hard-coded.

    Returns
    -------
    matplotlib.figure.Figure
        The figure, so the caller can save it.
    """
    y_pos = np.arange(len(names))
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.bar(y_pos, values, align='center', color=colours, alpha=0.5)
    # Place one tick under each bar and label it with the matching name.
    ax.set_xticks(y_pos)
    ax.set_xticklabels(names)
    ax.set_ylabel(y_lab)
    ax.set_xlabel(x_lab)
    ax.set_title(title)
    ax.set_ylim((ymin, ymax))
    # Horizontal grid lines only, to make bar heights easier to compare.
    ax.grid(axis='y')
    plt.show()
    return fig

In [None]:
# Labels and accuracies come out of scores_dict in insertion order, i.e. the
# order in which the models were evaluated.
model_names = scores_dict.keys()
model_acc = scores_dict.values()
# One colour per bar, in the same order as the entries of scores_dict.
clrs = ('g', 'g', 'b', 'r', 'r')
f = simple_barchart(
    model_names, model_acc, clrs,
    title='Stacking ensemble accuracy',
    y_lab='Accuracy', x_lab='Model',
    ymax=0.75,
)
# Save the chart next to the notebook.
f.savefig('StackBars.pdf')