In [1]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import scipy.stats as ss
from scipy.stats import binom, norm
import math

In [2]:
# Load the Iris dataset
iris = load_iris()
iris = iris
X = iris.data
y = iris.target

X = X[y != 0]
y = y[y != 0]

# Split into training and testing sets
X_train, X_test, y_train, y_test = \
    train_test_split(X,
                        y, 
                        test_size=0.5, 
                        random_state=42)

In [3]:
def checkRFClassifier(max_depth):
    output = {
        'predictions': {
            'raw': [],
            'corr': []
        },
        'accuracy': {
            'raw': []
        }
    }

    for i in range(100):
        # Train a Random Forest with a single tree
        clf = RandomForestClassifier(
                    max_depth=max_depth,
                    n_estimators=1, 
                    random_state=i
                )
        clf.fit(X_train, y_train)

        # Make predictions
        y_pred = clf.predict(X_test)
        for pred in output['predictions']['raw']:
            if np.all(y_pred == y_pred[0]):
                break
            elif np.all(pred == pred[0]):
                continue
            output['predictions']['corr'].append(ss.pearsonr(pred, y_pred)[0])
        
        output['predictions']['raw'].append(y_pred)

        # Calculate accuracy
        acc = accuracy_score(y_test, y_pred)
        output['accuracy']['raw'].append(acc)
        del clf
    
    output['accuracy']['mean'] = np.mean(output['accuracy']['raw'])
    output['accuracy']['std'] = np.std(output['accuracy']['raw'])
    return output

In [4]:
def probAnalysis(p, target, max_depth):
    N_approx = math.ceil(
                    2 * (norm.ppf(target) / (2 * p - 1)) ** 2
                )
    
    N_actual = 1
    temp = 0
    while temp < target:
        clf = RandomForestClassifier(
                    max_depth=max_depth,
                    n_estimators=N_actual, 
                    random_state=42
                )
        clf.fit(X_train, y_train)

        # Make predictions
        y_pred = clf.predict(X_test)
        temp = accuracy_score(y_true=y_test, y_pred=y_pred)
        N_actual += 2
    
    return N_actual, N_approx

In [5]:
for max_depth in range(1, 10 + 1):
    results = checkRFClassifier(max_depth=max_depth)
    N = probAnalysis(p=results['accuracy']['mean'],
                        target=0.8,
                        max_depth=max_depth)
    print(F"Max Depth: {max_depth}")
    print(f"- Expected Learners: {N[1]}")
    print(f"- Actual Learners  : {N[0]}")

Max Depth: 1
- Expected Learners: 4
- Actual Learners  : 3
Max Depth: 2
- Expected Learners: 3
- Actual Learners  : 3
Max Depth: 3
- Expected Learners: 3
- Actual Learners  : 3
Max Depth: 4
- Expected Learners: 3
- Actual Learners  : 3


KeyboardInterrupt: 