In [1]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.stats import norm
import numpy as np

# Import Data

We start by importing the Iris dataset. 

Since we model each individual classifier as a Bernoulli trial (its prediction is either correct or incorrect), we need a binary classification problem. The Iris dataset has three labels ($0$, $1$ and $2$), so we drop the samples labelled $0$ and keep only classes $1$ and $2$.

In [5]:
# Load Iris, then keep only the samples whose label is non-zero so that two
# classes remain.
iris = load_iris()
X, y = iris.data, iris.target

# The right-hand side is evaluated in full before rebinding, so both
# selections use the same, unfiltered ``y``.
X, y = X[y != 0], y[y != 0]

We then split it into training and testing sets of equal size.

In [None]:
# 50/50 train/test split; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=42,
)

# Approximations

In [3]:
def approx_learner_dist(max_depth, n_trials=100):
    """Collect test accuracies of independently seeded single-tree forests.

    For each trial ``i`` in ``range(n_trials)`` a ``RandomForestClassifier``
    with ``n_estimators=1`` and ``random_state=i`` is fitted on the
    notebook-level ``X_train``/``y_train`` and scored on ``X_test``/``y_test``.

    Parameters
    ----------
    max_depth : int or None
        Forwarded unchanged to ``RandomForestClassifier(max_depth=...)``.
    n_trials : int, default 100
        Number of single-tree learners to train (one per seed).

    Returns
    -------
    numpy.ndarray
        1-D array of length ``n_trials`` holding the accuracy of each learner.
    """
    accuracies = []
    for seed in range(n_trials):
        # Train a Random Forest with a single tree
        clf = RandomForestClassifier(
            max_depth=max_depth,
            n_estimators=1,
            random_state=seed,
        )
        clf.fit(X_train, y_train)

        # Score on the held-out split. The result must go into the local
        # list: calling ``.append`` on the ``accuracy_score`` function itself
        # raises AttributeError.
        y_pred = clf.predict(X_test)
        accuracies.append(accuracy_score(y_test, y_pred))

    return np.array(accuracies)

In [4]:
def find_actual_number(target, max_depth, max_estimators=None):
    """Find the smallest odd forest size whose test accuracy reaches ``target``.

    Forests with 1, 3, 5, ... trees (``random_state=42``) are fitted on the
    notebook-level ``X_train``/``y_train`` and scored on ``X_test``/``y_test``
    until the accuracy is no longer below ``target``.

    Parameters
    ----------
    target : float
        Accuracy threshold to reach.
    max_depth : int or None
        Forwarded unchanged to ``RandomForestClassifier(max_depth=...)``.
    max_estimators : int or None, default None
        Optional upper bound on the forest size to try. ``None`` means no
        bound, in which case the search does not terminate if ``target`` is
        never reached.

    Returns
    -------
    int
        The number of trees of the first forest that reached ``target``.

    Raises
    ------
    ValueError
        If ``max_estimators`` is given and no forest up to that size reaches
        ``target``.
    """
    n_estimators = 1
    while True:
        if max_estimators is not None and n_estimators > max_estimators:
            raise ValueError(
                f"target accuracy {target} not reached with up to "
                f"{max_estimators} estimators"
            )

        clf = RandomForestClassifier(
            max_depth=max_depth,
            n_estimators=n_estimators,
            random_state=42,
        )
        clf.fit(X_train, y_train)

        # Make predictions
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

        # Return the size that was actually evaluated. Incrementing before
        # returning would report a forest two trees larger than the one that
        # met the target.
        if not accuracy < target:
            return n_estimators

        # Step by two so the ensemble size stays odd.
        n_estimators += 2


def approximate(mu_p, sigma_p, N, approx, type=None):
    """Evaluate one of several normal-CDF approximations for ``N`` learners.

    Presumably this approximates the probability that a majority vote of
    ``N`` learners is correct -- confirm against the accompanying derivation.

    Parameters
    ----------
    mu_p : float
        Mean single-learner accuracy.
    sigma_p : float
        Standard deviation of the single-learner accuracy. Only used when
        ``approx == 'normal'``.
    N : int
        Number of learners.
    approx : {'binomial', 'poisson', 'normal'}
        Which approximation to evaluate.
    type : {'small', 'large'}, optional
        Variant of the binomial approximation; ignored for other ``approx``
        values.

    Returns
    -------
    float or None
        The ``norm.cdf`` value of the selected formula, or ``None`` when
        ``approx`` (or, for ``'binomial'``, ``type``) is not recognised.
    """
    def binom_approx(mu_p, N, type):
        if type == 'small':
            return norm.cdf((2 * mu_p - 1) * ((N + 1) / 2) ** 2)
        if type == 'large':
            numerator = N * (2 * mu_p - 1) + 1
            spread = (N * mu_p * (1 - mu_p)) ** 0.5
            return norm.cdf(numerator / (2 * spread))
        return None

    def poisson_approx(mu_p, N):
        # NOTE(review): this is the same expression as the 'small' binomial
        # variant -- confirm that is intended.
        return norm.cdf((2 * mu_p - 1) * ((N + 1) / 2) ** 2)

    def normal_approx(mu_p, sigma_p, N):
        numerator = N * (2 * mu_p - 1) + 1
        # Squares must use ``**``: in Python ``^`` is bitwise XOR, which is
        # undefined for floats and binds more loosely than subtraction.
        variance = N * (mu_p - sigma_p ** 2 - mu_p ** 2)
        return norm.cdf(numerator / (2 * variance ** 0.5))

    if approx == 'binomial':
        return binom_approx(mu_p=mu_p, N=N, type=type)
    if approx == 'poisson':
        return poisson_approx(mu_p=mu_p, N=N)
    if approx == 'normal':
        return normal_approx(mu_p=mu_p, sigma_p=sigma_p, N=N)
    return None