# Heterogeneous Ensembles
The obvious way to produce an ensemble of diverse classifiers is to use different model types as the base estimators. 
 1. We assess the performance of a classifier ensemble with 6 different estimators: *k*-NN, Logistic Regression, D-Tree, Artificial Neural Net, Support Vector Classifier and Naive Bayes. 
 2. We measure diversity using the plain disagreement measure.
 3. We compare this with the performance of a Bagging ensemble with 6 members.  
 
 The evaluation is done with a single hold-out test.  
 The `plain_dis` and `get_consensus_prediction` functions are imported from `ensemble_functions.py`

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt

### Load Data

In [None]:
# Read the hotel-review data set from the working directory into a DataFrame
# and preview its first rows.
hotel_pd = pd.read_csv('HotelRevHelpfulness.csv')
hotel_pd.head()

In [None]:
# Remove the identifier column; it is not used as a feature.
del hotel_pd['hotelId']
# Take the target out of the frame *before* building X, so that X holds
# only the remaining columns.
y = hotel_pd.pop('reviewHelpfulness').values
X = hotel_pd.values
X.shape

## Heterogeneous Ensemble

In [None]:
from statistics import mode
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from ensemble_functions import *

Data needs to be scaled for most of the models.

In [None]:
# `make_pipeline` stores the estimator objects it is given without copying
# them, so passing one StandardScaler to several pipelines makes them all
# share (and refit) the same scaler. Each pipeline below therefore gets its
# own scaler instance, keeping the fitted state of one model independent of
# the others.
scaler = StandardScaler()  # name kept defined for compatibility; not shared by the pipelines below
ann = make_pipeline(StandardScaler(), MLPClassifier(solver='lbfgs'))
lr = make_pipeline(StandardScaler(), LogisticRegression())
kNN = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3))
# The tree is the only model used without a scaling step.
dtree = DecisionTreeClassifier(criterion='entropy')
gnb = make_pipeline(StandardScaler(), GaussianNB())
svc = make_pipeline(StandardScaler(), SVC())

Generate estimator predictions and store in a dataframe

In [None]:
# Display name -> estimator. The insertion order of this dict fixes the
# column order of `res_df`, since the columns are added by iterating it.
estims = {
    'k-NN': kNN,
    'Tree': dtree,
    'Naive Bayes': gnb,
    'ANN': ann,
    'Logistic': lr,
    'SVC': svc,
}

# Single hold-out split; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=2, test_size=0.33
)

# One column of test-set predictions per estimator is added to this frame.
res_df = pd.DataFrame()

In [None]:
# Train every estimator on the training split and record its test-set
# predictions under the estimator's display name.
for label, model in estims.items():
    fitted = model.fit(X_train, y_train)
    res_df[label] = fitted.predict(X_test)
res_df.head()

A function to return a count of the entries in lists `l1` and `l2` that are not equal.  
This is the *plain disagreement* measure used to quantify ensemble diversity. 

In [None]:
# Spot-check the disagreement between a few pairs of estimators.
for first, second in (('k-NN', 'Tree'),
                      ('Naive Bayes', 'Tree'),
                      ('Naive Bayes', 'k-NN')):
    print('%4.3f' % (plain_dis(res_df[first], res_df[second])))

Calculate the plain disagreement scores for the 6 estimators.

In [None]:
models = estims.keys()
N = len(models)

# N x N matrix whose [i, j] entry is the plain disagreement between the
# predictions of model i and model j (rows and columns follow `models`).
plain_dis_arr = np.array(
    [[plain_dis(res_df[row], res_df[col]) for col in models]
     for row in models],
    dtype=float,
).reshape(N, N)
plain_dis_arr

In [None]:
# Largest pairwise disagreement among the heterogeneous ensemble members.
plain_dis_arr.max()

### Heterogeneous Ensemble Predictions

In [None]:
# Vote over the base-estimator columns only. Passing `res_df.columns` after
# 'Actual' has been added would hand the true labels to the consensus helper
# as if they were one more ensemble member, leaking the answer into the vote.
# NOTE(review): assumes get_consensus_prediction votes over exactly the
# columns it is given - confirm against ensemble_functions.py.
member_cols = pd.Index(list(estims))
res_df['Actual'] = y_test
res_df['Consensus'] = get_consensus_prediction(res_df, member_cols)
res_df

In [None]:
# Accuracy of each individual model, followed by the ensemble consensus.
# `model_acc` ends up with one entry per model plus a final consensus entry.
model_acc = []
for name in models:
    score = accuracy_score(res_df['Actual'], res_df[name])
    print(name, '%4.3f' % score)
    model_acc.append(score)

c_acc = accuracy_score(res_df['Actual'], res_df['Consensus'])
print('Consensus %4.3f' % c_acc)
model_acc.append(c_acc)

## Bagging
Ensembles based on Bagging. 
- 6 ensemble members are trained using bootstrap resampling

We don't need to worry about scaling here because the base estimator is a tree. 

In [None]:
# Bagging ensemble of six decision trees built from the `dtree` template.
tree_bag = BaggingClassifier(
    dtree,
    n_estimators=6,
    max_samples=1.0,  # bootstrap resampling
    bootstrap=True,
)

The 6 members of the bagging ensemble are trained on bootstrap samples from `X_train`

In [None]:
# Fit the bagging ensemble, then keep a handle on its fitted member trees.
tree_bag.fit(X_train, y_train)
bag_ests = tree_bag.estimators_

# One column of test-set predictions per member is added to this frame.
bag_df = pd.DataFrame()

In [None]:
# Record each member's test-set predictions in columns 'Est 1', 'Est 2', ...
for number, member in enumerate(bag_ests, start=1):
    bag_df['Est ' + str(number)] = member.predict(X_test)

In [None]:
# Vote over the member columns ('Est 1', 'Est 2', ...) only. Passing
# `bag_df.columns` after 'Actual' has been added would hand the true labels
# to the consensus helper as if they were one more ensemble member.
# NOTE(review): assumes get_consensus_prediction votes over exactly the
# columns it is given - confirm against ensemble_functions.py.
member_cols = pd.Index([col for col in bag_df.columns if col.startswith('Est ')])
bag_df['Actual'] = y_test
bag_df['Consensus'] = get_consensus_prediction(bag_df, member_cols)
bag_df.head()

In [None]:
N = 6
# Column labels of the bagging members, matching those used in `bag_df`.
bag_names = ['Est ' + str(k) for k in range(1, N + 1)]

# N x N matrix whose [i, j] entry is the plain disagreement between the
# predictions of member i and member j.
bag_dis_arr = np.array(
    [[plain_dis(bag_df[row], bag_df[col]) for col in bag_names]
     for row in bag_names],
    dtype=float,
).reshape(N, N)
bag_dis_arr

In [None]:
# Largest disagreement in each ensemble; `maxv` is the larger of the two and
# is reused later as a shared upper limit for both colourmaps.
hetero_max = plain_dis_arr.max()
bagged_max = bag_dis_arr.max()
print('Max difference in Hetero Ensemble: %4.3f' % hetero_max)
print('Max difference in Bagged Ensemble: %4.3f' % bagged_max)
maxv = max(bagged_max, hetero_max)
print('Overall max is: %4.3f' % maxv)

In [None]:
# Accuracy of each bagging member, followed by the ensemble consensus.
# `bag_est_acc` ends up with six member entries plus a final consensus entry.
bag_est_acc = []
for number in range(1, 7):
    label = 'Est ' + str(number)
    score = accuracy_score(bag_df['Actual'], bag_df[label])
    print(label, score)
    bag_est_acc.append(score)

bag_acc = accuracy_score(bag_df['Actual'], bag_df['Consensus'])
print('Consensus', bag_acc)
bag_est_acc.append(bag_acc)

## Plotting results
We look at the accuracies and the disagreement (diversity) among the ensemble members.   
We see that the diversity with bagging is just as good as with the heterogeneous ensemble. 

In [None]:
# Function to plot colourmaps of the disagreement matrices.

def do_colourmap(matrix, names, title=' ', **kwargs):
    """Plot `matrix` as an annotated grey-scale colourmap and return the figure.

    Parameters
    ----------
    matrix : 2-D array
        Values to display; indexed as ``matrix[i, j]`` for ``i`` and ``j`` in
        ``range(len(names))`` and must provide a ``max()`` method.
    names : sequence of str
        Tick labels, used for both the x and the y axis.
    title : str
        Title placed above the axes.
    **kwargs
        Only ``vmx`` is read: the upper limit of the colour scale. When it is
        absent the largest value in `matrix` is used. Passing the same ``vmx``
        to several calls makes their colour scales comparable. Any other
        keyword is ignored.

    Returns
    -------
    The matplotlib figure that was drawn (after it has been shown).
    """
    # Use the caller's colour-scale limit if the 'vmx' keyword was supplied.
    vmax_val = kwargs['vmx'] if 'vmx' in kwargs else matrix.max()
    n_labels = len(names)

    fig, ax = plt.subplots(figsize=(5, 5))
    ax.imshow(matrix, cmap='gray', vmax=vmax_val)

    # Show one tick per label on both axes and name the ticks.
    tick_positions = np.arange(n_labels)
    ax.set_xticks(tick_positions)
    ax.set_yticks(tick_positions)
    ax.set_xticklabels(names)
    ax.set_yticklabels(names)

    # Rotate the x tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Write each cell's value (rounded to 2 decimals) on top of the cell.
    for i in range(n_labels):
        for j in range(n_labels):
            ax.text(j, i, round(matrix[i, j], 2),
                    ha="center", va="center", color="r")

    ax.set_title(title)
    fig.tight_layout()
    plt.show()
    return fig

`maxv` is the largest value in both arrays so colourmaps match. 

In [None]:
# Disagreement colourmap for the heterogeneous ensemble, drawn with the
# shared colour-scale limit `maxv` and saved as a PDF.
model_names = [*models]
f = do_colourmap(
    plain_dis_arr,
    model_names,
    vmx=maxv,
    title='Heterogeneous ensemble disagreement',
)
f.savefig('HeteroCM.pdf')

In [None]:
# Display the number of member labels next to the matrix shape so the two
# can be compared before plotting.
len(bag_names), bag_dis_arr.shape

In [None]:
# Disagreement colourmap for the bagging ensemble, drawn with the shared
# colour-scale limit `maxv` and saved as a PDF.
f = do_colourmap(
    bag_dis_arr,
    bag_names,
    vmx=maxv,
    title='Bagging ensemble disagreement',
)
f.savefig('BagCM.pdf')

In [None]:
def simple_barchart(names, values, colours, title=' ',
                    y_lab='', x_lab='', ymax=1):
    """Draw a vertical bar chart, show it, and return the figure.

    Parameters
    ----------
    names : sequence of str
        One tick label per bar.
    values : sequence of numbers
        Bar heights.
    colours : colour or sequence of colours
        Passed to ``plt.bar`` as ``color``.
    title, y_lab, x_lab : str
        Chart title and axis labels.
    ymax : number
        Upper limit of the y axis; the lower limit is fixed at 0.

    Returns
    -------
    The matplotlib figure that was drawn (after it has been shown).
    """
    bar_positions = np.arange(len(names))
    fig = plt.figure(figsize=(6, 4))

    plt.bar(bar_positions, values, align='center', color=colours, alpha=0.5)
    plt.xticks(bar_positions, names)

    plt.title(title)
    plt.xlabel(x_lab)
    plt.ylabel(y_lab)

    plt.ylim((0, ymax))
    plt.grid(axis='y')

    plt.show()
    return fig

In [None]:
# Bar chart of the individual model accuracies (blue) with the ensemble
# consensus as the final bar (red), saved as a PDF.
model_names = list(models) + ['Ensemble']
clrs = ('b',) * 6 + ('r',)
f = simple_barchart(
    model_names,
    model_acc,
    clrs,
    title='Heterogenous ensemble accuracy',
    y_lab='Accuracy',
    x_lab='Model',
    ymax=0.8,
)
f.savefig('HeteroBars.pdf')

In [None]:
# Bar chart of the bagging member accuracies (blue) with the ensemble
# consensus as the final bar (red), saved as a PDF.
clrs = ('b',) * 6 + ('r',)
bar_labels = bag_names + ['Ensemble']
f = simple_barchart(
    bar_labels,
    bag_est_acc,
    clrs,
    title='Bagging ensemble accuracy',
    y_lab='Accuracy',
    x_lab='Model',
    ymax=0.8,
)
f.savefig('BagBars.pdf')