# Machine Learning Practice
## Module 12: Ensemble Methods

Andrew H. Fagg (andrewhfagg@gmail.com)

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix
from sklearn.metrics import log_loss
from sklearn.metrics import roc_curve, auc
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

import pickle as pkl

# Default figure parameters, applied in a single update so that the
# notebook-wide plot styling lives in one place
plt.rcParams.update({
    'figure.figsize': (5,5),
    'font.size': 10,
    'legend.fontsize': 10,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'figure.constrained_layout.use': True,
    'axes.titlesize': 14,
    'axes.labelsize': 12,
})


In [None]:
# Mount Google Drive at /content/drive.  This cell only works inside
# Google Colab (the google.colab package is not available elsewhere).
# The data set loaded later in this notebook is read from a path under
# this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
def scatter_plot(ins, outs, ins2=None, outs2=None):
    '''
    Generate a scatter plot with points colored by a binary label

    Samples labeled 1 are drawn in red and samples labeled 0 in green.
    The first data set is drawn with small dots; the optional second data
    set is drawn on the same axes with circles.

    :param ins: Nx2 matrix of 2D samples (any array-like)
    :param outs: N binary labels, one per row of ins (0 or 1)
    :param ins2: Optional Mx2 matrix of 2D samples for a second data set
    :param outs2: M binary labels for ins2 (0 or 1); required if ins2 is given
    :raises ValueError: if ins2 is given without outs2
    '''

    # Check the arguments before any figure is created, so that a bad call
    # does not leave an empty figure behind
    if ins2 is not None and outs2 is None:
        raise ValueError("outs2 must be provided when ins2 is given")

    # Each entry: (samples, labels, marker style)
    datasets = [(ins, outs, '.')]
    if ins2 is not None:
        datasets.append((ins2, outs2, 'o'))

    # Generate the figure
    fig, ax = plt.subplots()

    for samples, labels, marker in datasets:
        # Coerce to arrays: comparing a plain Python list against 1 yields
        # a single False rather than an element-wise mask, which would
        # silently produce an empty plot
        samples = np.asarray(samples)
        labels = np.asarray(labels)

        # Identify the labeled trues and falses
        elems_true = np.where(labels == 1)[0]
        elems_false = np.where(labels == 0)[0]

        ax.plot(samples[elems_true, 0], samples[elems_true, 1], 'r' + marker)
        ax.plot(samples[elems_false, 0], samples[elems_false, 1], 'g' + marker)

    # Only the first two lines (the first data set) receive legend entries;
    # the second data set shares the same color coding
    fig.legend(['Positive', 'Negative'])
    ax.set_xlabel('x[0]')
    ax.set_ylabel('x[1]')

In [None]:
def plot_probs(outs, proba):
    '''
    Report classification quality for a set of predicted probabilities

    Prints the confusion matrix (column 0 of proba thresholded at 0.5),
    the log loss and the AUC.  Draws two figures: TPR, FPR and their
    difference as functions of the decision threshold, and the ROC curve
    together with the chance diagonal.

    :param outs: N-vector of true labels (0 or 1)
    :param proba: Nx2 matrix of predicted probabilities; column 0 is used
                  as the score for the positive class
    '''

    # NOTE(review): column 0 is treated as the positive-class score for the
    # confusion matrix and the ROC curve, but the full matrix is handed to
    # log_loss, which (per the scikit-learn docs) appears to expect columns
    # in sorted label order (column 0 = label 0).  Confirm which column
    # order the callers actually supply.
    scores = proba[:,0]

    # Hard decisions at the 0.5 threshold
    hard_labels = scores >= 0.5
    print("Confusion:", confusion_matrix(outs, hard_labels))

    # Evaluate
    print("log loss: ", log_loss(outs, proba))

    fpr, tpr, thresholds = roc_curve(outs, scores)

    # TPR/FPR plot: rates (and their gap) as a function of the threshold
    _, ax_rates = plt.subplots()
    for curve, color in ((tpr, 'b'), (fpr, 'r'), (tpr - fpr, 'g')):
        ax_rates.plot(thresholds, curve, color=color)
    # Show high thresholds on the left so the curves rise left to right
    ax_rates.invert_xaxis()
    ax_rates.set_xlabel('threshold')
    ax_rates.set_ylabel('fraction')
    ax_rates.legend(['TPR', 'FPR', 'distance'])

    # ROC plot, with the chance diagonal for reference
    _, ax_roc = plt.subplots()
    ax_roc.plot(fpr, tpr, color='b')
    ax_roc.plot([0,1], [0,1], 'r--')
    ax_roc.set_xlabel('FPR')
    ax_roc.set_ylabel('TPR')
    ax_roc.set_aspect('equal', 'box')
    print("AUC:", auc(fpr, tpr))

## Load Data

In [None]:
#fname = 'ensemble_data.pkl'
fname = '/content/drive/MyDrive/MLP_2022/datasets/ensemble_data.pkl'

# The file holds two consecutive pickled objects, read back in the order
# they were written: first ins, then outs.  The context manager closes the
# file even if one of the loads raises.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open(fname, 'rb') as fp:
    ins = pkl.load(fp)
    outs = pkl.load(fp)

In [None]:
# Split training from validation: the first N_training samples are used
# for training (ins1/outs1); the remainder are held out (ins2/outs2)
N_training = 100
ins1, ins2 = ins[:N_training,:], ins[N_training:,:]
outs1, outs2 = outs[:N_training], outs[N_training:]

# Show both sets on one figure
scatter_plot(ins1, outs1, ins2, outs2)

## Create Classifier Ensemble

## Individual classifiers

## Soft voting