In [1]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import numpy as np

We demonstrate the entire process using Random Forests with a maximum depth of $1$.

# Dataset Preparation

We start by importing the Iris dataset. 

Since we are representing our single classifier as a Bernoulli trial, this is a binary classification problem. The Iris dataset has three labels, so we drop the samples labelled $0$ and re-encode the two remaining labels as $0$ and $1$.

In [2]:
# Load the Iris dataset: feature matrix X and integer class labels y.
iris = load_iris()
X = iris.data
y = iris.target

# Class label to discard so that only two classes remain (binary problem).
remove_label = 0

# Build the row mask once, from the unfiltered labels, and apply the same
# mask to both arrays so X and y cannot fall out of alignment.
keep = y != remove_label
X = X[keep]
y = y[keep]

# Re-encode the two remaining labels as 0/1: the largest remaining label
# becomes 1, the other becomes 0.
y = (y == np.max(y)).astype(int)

We then split it into training and testing sets of equal size.

In [3]:
# Hold out half of the samples for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42
)

# Bundle the four arrays under string keys; this dict is what gets handed to
# ensembleEstimation later in the notebook.
data = dict(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

# Set Up Initial Classifiers

Before Equation $4.2$, we assumed that the $p_i$ are drawn from a distribution with mean $\mu_p$ and variance $\sigma_p^2$.

Thus, we need to estimate this distribution. We do so by generating $100$ random forests, each of size $1$, and measuring the accuracy of each using the `approx_learner_dist` method of the `ensembleEstimation` class.

In [4]:
from ensembleEstimation import ensembleEstimation

# Build the estimator from the train/test bundle.
# NOTE(review): the meaning of the first argument (1) is not visible here;
# presumably the forest size or the tree depth -- confirm against the class.
ensemble = ensembleEstimation(1, data)

# Estimate the distribution of single-learner accuracies.
# NOTE(review): mu_p / sigma_p are read right after this call, so presumably
# it is what populates them -- confirm against the class.
probs = ensemble.approx_learner_dist()

# Report the estimated mean and spread of the per-learner accuracy.
print(
    f'mu_p : {ensemble.mu_p:.2f}',
    f'sigma_p : {ensemble.sigma_p:.2e}',
    sep='\n',
)

mu_p : 0.90
sigma_p : 4.44e-16


# Theoretical Approximation

Since $\mu_p$ is far from $0$ and $\sigma_p \to 0$, we need to use the binomial approximation, but for the sake of demonstration, we use all three approximations.

Suppose we use ensembles of size $11$, $21$, $\dots$, $51$ with each of these approximations and compare their performance. As $\mu_p$ is large, increasing the size of the ensemble will change the type of the binomial approximation we have to use.

We store the actual and predicted accuracies in a Pandas DataFrame for convenience.

In [5]:
# Parallel accumulators: one entry per ensemble size.
Ns, actual_acc = [], []
pred_acc_bin, pred_acc_poi, pred_acc_nor = [], [], []

# Ensemble sizes 11, 21, ..., 51. The four calls per size are made in a fixed
# order (actual, binomial, poisson, normal) in case the estimator is stateful.
for N in range(11, 52, 10):
    Ns += [N]
    actual_acc += [ensemble.find_actual_accuracy(N)]
    pred_acc_bin += [ensemble.approximate(N, "binomial")]
    pred_acc_poi += [ensemble.approximate(N, "poisson")]
    pred_acc_nor += [ensemble.approximate(N, "normal")]

# One row per ensemble size: measured accuracy next to each approximation.
results = pd.DataFrame({
    'N': Ns,
    'Act. Acc': actual_acc,
    'Bin. App.': pred_acc_bin,
    'Pois. App.': pred_acc_poi,
    'Norm. App.': pred_acc_nor,
})

# Signed relative error of each approximation against the measured accuracy,
# expressed in percent: (approx - actual) / actual * 100.
for re_col, approx_col in [
    ('RE_B_p', 'Bin. App.'),
    ('RE_P_p', 'Pois. App.'),
    ('RE_N_p', 'Norm. App.'),
]:
    signed_error = results[approx_col] - results['Act. Acc']
    results[re_col] = signed_error / results['Act. Acc'] * 100

results

Unnamed: 0,N,Act. Acc,Bin. App.,Pois. App.,Norm. App.,RE_B_p,RE_P_p,RE_N_p
0,11,0.88,0.987489,0.974978,1.0,12.214649,10.792982,13.636316
1,21,0.94,0.998007,0.996015,1.0,6.170987,5.958995,6.382979
2,31,0.94,0.999656,0.999313,1.0,6.346429,6.309879,6.382979
3,41,0.94,0.999938,0.999877,1.0,6.376428,6.369877,6.382979
4,51,0.94,0.999989,0.999977,1.0,6.381777,6.380575,6.382979


In [6]:
# Persist the comparison table for the paper, without the row index column.
results.to_excel(
    excel_writer='Paper/tables/table-uneq-prob-demo.xlsx',
    index=False,
)

# Drop the notebook-level references to the table and the estimator.
del results
del ensemble