# Learning with Ensembles

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
import math
from scipy.special import comb

def ensemble_error(n_classifier, error):
    """Return the upper-tail binomial probability used as the ensemble error.

    Sums ``C(n, k) * error**k * (1 - error)**(n - k)`` for every ``k`` from
    ``ceil(n_classifier / 2)`` up to ``n_classifier``, i.e. the probability
    that at least half of ``n_classifier`` independent voters, each wrong
    with probability ``error``, are wrong at the same time.

    Parameters
    ----------
    n_classifier : int
        Number of base classifiers in the ensemble.
    error : float
        Error rate shared by every base classifier.

    Returns
    -------
    float
        The summed tail probability.
    """
    # smallest number of wrong votes that is included in the tail
    min_wrong = int(math.ceil(n_classifier / 2.))
    total = 0
    for n_wrong in range(min_wrong, n_classifier + 1):
        n_right = n_classifier - n_wrong
        total += comb(n_classifier, n_wrong) * error**n_wrong * (1-error)**n_right
    return total

ensemble_error(n_classifier=11, error=0.25)

In [None]:
# base error rates 0.00, 0.01, ..., 1.00 (101 evenly spaced points).
# linspace fixes the number of points and both endpoints explicitly, whereas the
# length of a float-step arange depends on floating-point rounding.
error_range = np.linspace(0.0, 1.0, 101)
# ensemble error of an 11-member ensemble at each base error rate
ens_errors = [ensemble_error(n_classifier=11, error=error) for error in error_range]

In [None]:
# compare the ensemble error with the base error it was computed from
fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(error_range, ens_errors, linewidth=2, label='Ensemble error')
# dashed reference line: the base error plotted against itself
ax.plot(error_range, error_range, linewidth=2, linestyle='--', label='Base error')
ax.set_xlabel('Base error')
ax.set_ylabel('Base/Ensemble error')
ax.grid(alpha=0.5)
ax.legend(loc='upper left')
plt.show()

# Combining via Majority Vote

In [None]:
# weighted majority vote: add up the weight behind each class label,
# then pick the label with the largest total
class_votes = [0, 0, 1]
vote_weights = [0.2, 0.2, 0.6]
weighted_counts = np.bincount(class_votes, weights=vote_weights)
np.argmax(weighted_counts)

In [None]:
# class probabilities, one row per weighted voter and one column per class
ex = np.array([
    [0.9, 0.1],
    [0.8, 0.2],
    [0.4, 0.6],
])
# weighted average down the rows gives one combined probability per class
row_weights = [0.2, 0.2, 0.6]
p = np.average(ex, weights=row_weights, axis=0)
print(p)

# the predicted class is the column with the largest combined probability
np.argmax(p)

In [None]:
from majority_vote_classifier import MajorityVoteClassifier

from sklearn import datasets

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_curve, auc, accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier 

In [None]:
# load iris, skip the first 50 samples and keep feature columns 1 and 2
# (labelled sepal width and petal length in the decision-region plot further down)
iris = datasets.load_iris()
X = iris.data[50:, [1, 2]]
y = iris.target[50:]
# re-encode the remaining class labels as integers
le = LabelEncoder()
y = le.fit_transform(y)

In [None]:
# hold out half of the samples for testing; stratify so both halves keep
# the class proportions of y
splits = train_test_split(X, y, test_size=0.5, stratify=y, random_state=1)
X_train, X_test, y_train, y_test = splits

In [None]:
# three base classifiers from different model families
clf1 = LogisticRegression(C=0.001, penalty='l2', solver='liblinear', random_state=1)
clf2 = DecisionTreeClassifier(criterion='entropy', max_depth=1, random_state=0)
clf3 = KNeighborsClassifier(n_neighbors=1, metric='minkowski', p=2)

# logistic regression and KNN get a standardization step in front of them;
# the decision tree is used on the raw features
pipe1 = Pipeline([['sc', StandardScaler()], ['clf', clf1]])
pipe3 = Pipeline([['sc', StandardScaler()], ['clf', clf3]])

# score each model with 10-fold cross-validated ROC AUC on the training split
clf_labels = ['Logistic regression', 'Decision tree', 'KNN']
print('10-fold cross validation:\n')
for label, clf in zip(clf_labels, [pipe1, clf2, pipe3]):
    scores = cross_val_score(estimator=clf, X=X_train, y=y_train, scoring='roc_auc', cv=10)
    auc_mean, auc_std = scores.mean(), scores.std()
    print(f"ROC AUC: {auc_mean:.2f} (+/- {auc_std:.2f}) [{label}]")

In [None]:
# combine the three classifiers into one majority-vote ensemble
mv_clf = MajorityVoteClassifier(classifiers=[pipe1, clf2, pipe3])

# register the ensemble's label only once, so re-running this cell does not
# keep appending duplicates to clf_labels
if 'Majority voting' not in clf_labels:
    clf_labels.append('Majority voting')
all_clf = [pipe1, clf2, pipe3, mv_clf]

# for each model (ensemble included), perform 10-fold cross-validation
for clf, label in zip(all_clf, clf_labels):
    scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=10, scoring='roc_auc')
    # the metric requested above is ROC AUC, so report it under that name
    print(f"ROC AUC: {scores.mean():.2f} (+/- {scores.std():.2f}) [{label}]")

In [None]:
# one colour and one line style per entry of all_clf
linestyles = [':', '--', '-.', '-']
colors = ['black', 'orange', 'blue', 'green']

plt.figure(figsize=(10, 10))
for clf, label, clr, ls in zip(all_clf, clf_labels, colors, linestyles):
    # fit on the training split, then score the test split; column 1 of
    # predict_proba is taken to be the positive class (label 1)
    fitted = clf.fit(X_train, y_train)
    y_pred = fitted.predict_proba(X_test)[:, 1]
    # ROC curve and the area under it
    fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_pred)
    roc_auc = auc(x=fpr, y=tpr)
    plt.plot(fpr, tpr, linestyle=ls, color=clr, label=f'{label} (auc = {roc_auc:.2f})')

# diagonal reference line for random guessing
plt.plot([0, 1], [0, 1], color='r', linestyle='--', alpha=0.5, linewidth=2)

# axis labels, legend and grid
plt.xlabel('False positive rate (FPR)')
plt.ylabel('True positive rate (TPR)')
plt.legend(loc='lower right')
plt.grid(alpha=0.5)
plt.show()

In [None]:
from itertools import product

# standardize the training features for plotting
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)

# grid of points covering the standardized data plus a margin of 1 on every side
feat0, feat1 = X_train_std[:, 0], X_train_std[:, 1]
x_min, x_max = feat0.min() - 1, feat0.max() + 1
y_min, y_max = feat1.min() - 1, feat1.max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))
grid_points = np.c_[xx.ravel(), yy.ravel()]

# one panel per classifier in a 2x2 layout
f, axarr = plt.subplots(nrows=2, ncols=2, sharex='col', sharey='row', figsize=(10, 10))
for (row, col), clf, tt in zip(product([0, 1], [0, 1]), all_clf, clf_labels):
    ax = axarr[row, col]
    # refit on the standardized data and classify every grid point
    clf.fit(X_train_std, y_train)
    Z = clf.predict(grid_points).reshape(xx.shape)
    # shaded decision surface with the training samples drawn on top
    ax.contourf(xx, yy, Z, alpha=0.3)
    for cls, colour, marker in [(0, 'blue', '^'), (1, 'green', 'o')]:
        members = X_train_std[y_train == cls]
        ax.scatter(members[:, 0], members[:, 1], c=colour, marker=marker, s=50)
    ax.set_title(tt)

# shared axis captions, positioned in the data coordinates of the current axes
plt.text(-3.5, -4.5, s='Sepal width [standardized]',
         fontsize=12, ha='center', va='center')
plt.text(-12.5, 4.5, s='Petal length [standardized]',
         fontsize=12, ha='center', va='center', rotation=90)
plt.show()

In [None]:
# list the ensemble's parameters; the keys of the grid-search dictionary used
# further down are expected to come from this listing
mv_clf.get_params()

In [None]:
# candidate values for the decision tree's max depth and the logistic regression's C;
# the keys are expected to match the names reported by mv_clf.get_params()
params = {'decisiontreeclassifier__max_depth': [1, 2],
          'pipeline-1__clf__C': [0.001, 0.1, 100.0]}

# exhaustive search over the grid, scored by 10-fold cross-validated ROC AUC.
# `iid` is deliberately not passed: it was deprecated in scikit-learn 0.22 and
# removed in 0.24, where passing it raises a TypeError.
grid = GridSearchCV(estimator=mv_clf, param_grid=params, cv=10, scoring='roc_auc')
grid.fit(X_train, y_train)

In [None]:
# for each parameter combination, print the mean cross-validated score and its
# standard deviation
params = grid.cv_results_['params']
means = grid.cv_results_['mean_test_score']
stds = grid.cv_results_['std_test_score']

for param, mean, std in zip(params, means, stds):
    # report the full standard deviation, matching the "+/-" figures printed by
    # the cross_val_score loops earlier in the notebook
    print(f"{mean:.3f} +/- {std:.2f} {param}")

# best parameter combination found by the search
print(f'Best parameters: {grid.best_params_}')

# best mean score; the search was scored with 'roc_auc', so label it as ROC AUC
print(f'ROC AUC: {grid.best_score_:.3f}')

# Bagging - Building Ensembles with Bootstrap Samples

In [None]:
# column names for the wine data (the file is read without a header row)
wine_columns = [
    'Class label', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash', 'Magnesium',
    'Total phenols', 'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
    'Color intensity', 'Hue', 'OD280/OD315 of diluted wines', 'Proline',
]

# load the wine dataset as a pandas DataFrame and attach the column names
df_wine = pd.read_csv('https://archive.ics.uci.edu/ml/'
                      'machine-learning-databases/wine/wine.data',
                      header=None)
df_wine.columns = wine_columns

# keep only the rows whose class label is not 1
not_class_one = df_wine['Class label'] != 1
df_wine = df_wine[not_class_one]
# two feature columns as X, the class label as y
X = df_wine[['Alcohol', 'OD280/OD315 of diluted wines']].values
y = df_wine['Class label'].values

In [None]:
# re-encode the remaining class labels as integers, then hold out 20% of the
# samples as a stratified test split
le = LabelEncoder()
y = le.fit_transform(y)
split = train_test_split(X, y, test_size=0.2, stratify=y, random_state=1)
X_train, X_test, y_train, y_test = split

In [None]:
from sklearn.ensemble import BaggingClassifier

# decision tree with no depth limit, used as the base learner
tree = DecisionTreeClassifier(criterion='entropy', random_state=1, max_depth=None)
# bagging ensemble of 500 such trees, each trained on a bootstrap sample of the
# training set using all features.
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in 1.4),
# but it is the first positional parameter under both names.
bag = BaggingClassifier(
    tree, n_estimators=500, max_samples=1.0, max_features=1.0,
    bootstrap=True, bootstrap_features=False, n_jobs=1, random_state=1
)

In [None]:
# train the single decision tree, then measure its accuracy on both splits
tree = tree.fit(X_train, y_train)

# training split
y_train_pred = tree.predict(X_train)
tree_train = accuracy_score(y_train, y_train_pred)

# test split
y_test_pred = tree.predict(X_test)
tree_test = accuracy_score(y_test, y_test_pred)

print(f'Decision tree train/test accuracies {tree_train:.3f}/{tree_test:.3f}')

In [None]:
# train the bagging ensemble, then measure its accuracy on both splits
bag = bag.fit(X_train, y_train)

# training split
y_train_pred = bag.predict(X_train)
bag_train = accuracy_score(y_train, y_train_pred)

# test split
y_test_pred = bag.predict(X_test)
bag_test = accuracy_score(y_test, y_test_pred)

print(f'Bagging train/test accuracies {bag_train:.3f}/{bag_test:.3f}')

In [None]:
# plot decision regions for both classifiers
# grid of points covering the training data plus a margin of 1 on every side
x_min = X_train[:, 0].min() - 1
x_max = X_train[:, 0].max() + 1
y_min = X_train[:, 1].min() - 1
y_max = X_train[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))
f, axarr = plt.subplots(nrows=1, ncols=2, sharex='col', sharey='row', figsize=(15, 8))
for idx, clf, tt in zip([0, 1], [tree, bag], ['Decision tree', 'Bagging']):
    # fit here as well so the cell does not rely on the models being fitted already
    clf.fit(X_train, y_train)
    # classify every grid point and draw the result as a shaded surface
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    axarr[idx].contourf(xx, yy, Z, alpha=0.3)
    # training samples of each class on top
    axarr[idx].scatter(X_train[y_train == 0, 0], X_train[y_train == 0, 1],
                       c='blue', marker='^')
    axarr[idx].scatter(X_train[y_train == 1, 0], X_train[y_train == 1, 1],
                       c='green', marker='o')
    axarr[idx].set_title(tt)
# feature column 0 (Alcohol) is on the horizontal axis and column 1
# (OD280/OD315 of diluted wines) on the vertical one, so label them that way round
axarr[0].set_ylabel('OD280/OD315 of diluted wines', fontsize=12)
plt.text(10.2, -1.2, s='Alcohol',
         ha='center', va='center', fontsize=12)
plt.show()

# Leveraging Weak Learners via Adaptive Boosting

# Applying AdaBoost Using `scikit-learn`

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# depth-1 decision tree, used as the base learner
tree = DecisionTreeClassifier(
    criterion='entropy', random_state=1, max_depth=1
)
# AdaBoost ensemble of 500 such trees with a learning rate of 0.1.
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in 1.4),
# but it is the first positional parameter under both names.
ada = AdaBoostClassifier(
    tree, n_estimators=500, learning_rate=0.1, random_state=1
)

In [None]:
# train the depth-1 decision tree, then measure its accuracy on both splits
tree = tree.fit(X_train, y_train)

# training split
y_train_pred = tree.predict(X_train)
tree_train = accuracy_score(y_train, y_train_pred)

# test split
y_test_pred = tree.predict(X_test)
tree_test = accuracy_score(y_test, y_test_pred)

print(f'Decision tree train/test accuracies {tree_train:.3f}/{tree_test:.3f}')

In [None]:
# train the AdaBoost ensemble, then measure its accuracy on both splits
ada = ada.fit(X_train, y_train)

# training split
y_train_pred = ada.predict(X_train)
ada_train = accuracy_score(y_train, y_train_pred)

# test split
y_test_pred = ada.predict(X_test)
ada_test = accuracy_score(y_test, y_test_pred)

print(f'AdaBoost train/test accuracies {ada_train:.3f}/{ada_test:.3f}')

In [None]:
# plot decision regions for both classifiers
# grid of points covering the training data plus a margin of 1 on every side
x_min = X_train[:, 0].min() - 1
x_max = X_train[:, 0].max() + 1
y_min = X_train[:, 1].min() - 1
y_max = X_train[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))
f, axarr = plt.subplots(1, 2, sharex='col', sharey='row', figsize=(15, 8))
for idx, clf, tt in zip([0, 1], [tree, ada], ['Decision Tree', 'AdaBoost']):
    # fit here as well so the cell does not rely on the models being fitted already
    clf.fit(X_train, y_train)
    # classify every grid point and draw the result as a shaded surface
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    axarr[idx].contourf(xx, yy, Z, alpha=0.3)
    # training samples of each class on top
    axarr[idx].scatter(X_train[y_train == 0, 0], X_train[y_train == 0, 1],
                       c='blue', marker='^')
    axarr[idx].scatter(X_train[y_train == 1, 0], X_train[y_train == 1, 1],
                       c='red', marker='o')
    axarr[idx].set_title(tt)
# feature column 0 (Alcohol) is on the horizontal axis and column 1
# (OD280/OD315 of diluted wines) on the vertical one, so label them that way round;
# the y label is set once, after the loop, rather than on every iteration
axarr[0].set_ylabel('OD280/OD315 of diluted wines', fontsize=12)
plt.text(10.2, -0.5, s='Alcohol',
         ha='center', va='center', fontsize=12)
plt.show()