In [1]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Import Data

We start by importing the Iris dataset. 

Since we are representing our single classifier as a Bernoulli trial, this is a binary classification problem. The Iris dataset has three labels, so we drop the data that is labelled $0$.

In [2]:
iris = load_iris()
iris = iris
X = iris.data
y = iris.target

X = X[y != 0]
y = y[y != 0]

We then split it into training and testing sets of equal size.

In [3]:
X_train, X_test, y_train, y_test = \
    train_test_split(X,
                        y, 
                        test_size=0.5, 
                        random_state=42)
data = {
    'X_train': X_train, 
    'X_test': X_test, 
    'y_train': y_train, 
    'y_test': y_test
}

# The Process

## Classifiers of Max Depth $1$

We perform the entire process for Random Forests with a maximum depth of $1$

Before Equation $4.2$, we assumed $p_i$ are generated by a distribution with mean $\mu_p$ and variance $\sigma_p^2$.

Thus, we need to estimate this distribution. We do so by generating $100$ random forests, each of size $1$ and getting the accuracy of each using the `approx_learner_dist` function, from the `ensembleEstimation` class.

In [4]:
from ensembleEstimation import ensembleEstimation

ensemble = ensembleEstimation(1, data)
probs = ensemble.approx_learner_dist()

print(f'mu_p : {ensemble.mu_p}')
print(f'sigma_p : {ensemble.sigma_p}')

mu_p : 0.9399999999999996
sigma_p : 3.3306690738754696e-16


Since, $\mu_p$ is fairly away from $0$ and $\sigma_p \to 0$, we need to use the binomial approximation. Suppose we use ensembles of size $11$, $21$, $\dots$, $51$ with both of these approximations and compare their performance. As $\mu_p$ is large, increasing the size of the ensemble will change the type of the binomial approximation we have to use.

We store the actual and predicted accuracies in a Pandas DataFrame for convenience.

In [5]:
Ns = []
actual_acc = []
pred_acc_bs = []
pred_acc_bl = []

for N in range(11, 51 + 1, 10):
    Ns.append(N)
    actual_acc.append(ensemble.find_actual_accuracy(N))
    pred_acc_bs.append(ensemble.approximate(N, "binomial", 'small'))
    pred_acc_bl.append(ensemble.approximate(N, "binomial", 'large'))


results = pd.DataFrame(
    {
        'N': Ns,
        'Act. Acc': actual_acc,
        'Bin. (S) App.': pred_acc_bs,
        'Bin. (L) App.': pred_acc_bl,
    }
)
results['RE (BS)'] = \
        (results['Bin. (S) App.'] \
            - results['Act. Acc']) \
        /results['Act. Acc'] * 100
results['RE (BL)'] = \
        (results['Bin. (L) App.'] \
            - results['Act. Acc']) \
        /results['Act. Acc'] * 100

results

Unnamed: 0,N,Act. Acc,Bin. (S) App.,Bin. (L) App.,RE (BS),RE (BL)
0,11,0.92,1.0,1.0,8.695652,8.695652
1,21,0.94,1.0,1.0,6.382979,6.382979
2,31,0.94,1.0,1.0,6.382979,6.382979
3,41,0.94,1.0,1.0,6.382979,6.382979
4,51,0.94,1.0,1.0,6.382979,6.382979


In [6]:
results[['RE (BS)', 'RE (BL)']].describe()

Unnamed: 0,RE (BS),RE (BL)
count,5.0,5.0
mean,6.845513,6.845513
std,1.034259,1.034259
min,6.382979,6.382979
25%,6.382979,6.382979
50%,6.382979,6.382979
75%,6.382979,6.382979
max,8.695652,8.695652


In [7]:
results.to_excel('Paper/tables/Ensemble - Size 1.xlsx', index=False)
del results, ensemble

## Classifiers of Max Depth $2$

We repear the entire process for Random Forests with a maximum depth of $2$

In [8]:
from ensembleEstimation import ensembleEstimation

ensemble = ensembleEstimation(10, data)
probs = ensemble.approx_learner_dist()

print(f'mu_p : {ensemble.mu_p}')
print(f'sigma_p : {ensemble.sigma_p}')

mu_p : 0.9200000000000002
sigma_p : 1.1102230246251565e-16


Since, $\mu_p$ is fairly away from $0$ and $\sigma_p \to 0$, we need to use the binomial approximation. Suppose we use ensembles of size $11$, $21$, $\dots$, $51$ with both of these approximations and compare their performance. As $\mu_p$ is large, increasing the size of the ensemble will change the type of the binomial approximation we have to use.

We store the actual and predicted accuracies in a Pandas DataFrame for convenience.

In [9]:
Ns = []
actual_acc = []
pred_acc_bs = []
pred_acc_bl = []

for N in range(11, 51 + 1, 10):
    Ns.append(N)
    actual_acc.append(ensemble.find_actual_accuracy(N))
    pred_acc_bs.append(ensemble.approximate(N, "binomial", 'small'))
    pred_acc_bl.append(ensemble.approximate(N, "binomial", 'large'))


results = pd.DataFrame(
    {
        'N': Ns,
        'Act. Acc': actual_acc,
        'Bin. (S) App.': pred_acc_bs,
        'Bin. (L) App.': pred_acc_bl,
    }
)
results['RE (BS)'] = \
        (results['Bin. (S) App.'] \
            - results['Act. Acc']) \
        /results['Act. Acc'] * 100
results['RE (BL)'] = \
        (results['Bin. (L) App.'] \
            - results['Act. Acc']) \
        /results['Act. Acc'] * 100

results

Unnamed: 0,N,Act. Acc,Bin. (S) App.,Bin. (L) App.,RE (BS),RE (BL)
0,11,0.92,1.0,1.0,8.695652,8.695651
1,21,0.92,1.0,1.0,8.695652,8.695652
2,31,0.9,1.0,1.0,11.111111,11.111111
3,41,0.88,1.0,1.0,13.636364,13.636364
4,51,0.92,1.0,1.0,8.695652,8.695652


In [10]:
results[['RE (BS)', 'RE (BL)']].describe()

Unnamed: 0,RE (BS),RE (BL)
count,5.0,5.0
mean,10.166886,10.166886
std,2.203544,2.203544
min,8.695652,8.695651
25%,8.695652,8.695652
50%,8.695652,8.695652
75%,11.111111,11.111111
max,13.636364,13.636364


In [11]:
results.to_excel('Paper/tables/Ensemble - Size 2.xlsx', index=False)
del results, ensemble