# Imports

In [33]:
%load_ext autoreload
%autoreload 2

import numpy as np

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

from sklearn.datasets import *

from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import copy

from data_structures.tree_classifier import TreeClassifier
import utils.utils
import time

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Prepare Data

In [1]:
# Download all datasets from sklearn
for m in [fetch_olivetti_faces, fetch_20newsgroups_vectorized, fetch_lfw_people, fetch_lfw_pairs, fetch_covtype, fetch_rcv1, fetch_kddcup99, fetch_california_housing]:
    print(m)
    try:
        all_ = m()
        train = m(subset='train')
        test = m(subset='test')
    except:
        pass

<function fetch_olivetti_faces at 0x7fb910b9f940>
<function fetch_20newsgroups_vectorized at 0x7fb910b71e50>
<function fetch_lfw_people at 0x7fb910af15e0>
<function fetch_lfw_pairs at 0x7fb910af1700>
<function fetch_covtype at 0x7fb910a9be50>
<function fetch_rcv1 at 0x7fb9009f19d0>
<function fetch_kddcup99 at 0x7fb910af1040>
<function fetch_california_housing at 0x7fb9009f1820>


In [34]:
# Download the data from two categories
cats = ['alt.atheism', 'sci.space']
ng_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), categories=cats)
ng_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'), categories=cats)


vectorizer = TfidfVectorizer()
trans = vectorizer.fit(ng_train.data)
train_vectors = vectorizer.transform(ng_train.data)
test_vectors = vectorizer.transform(ng_test.data)
print("Number of datapoints: ", len(ng_train.data))
print("Number of features: ", train_vectors.shape[1])
print("Balance: ", np.sum(ng_train.target) / len(ng_train.target)) # 55-45, roughly balanced

N_COMPONENTS=100
pca = PCA(n_components=N_COMPONENTS)
pca.fit(train_vectors.toarray())
pca_train_vecs = pca.transform(train_vectors.toarray())
pca_test_vecs = pca.transform(test_vectors.toarray())

Number of datapoints:  1073
Number of features:  18217
Balance:  0.5526561043802423


# Compare our implementation's accuracy to sklearn

In [7]:
dt = DecisionTreeClassifier(random_state=0)
dt.fit(pca_train_vecs,ng_train.target)
print("sklearn Decision Tree Accuracy:", np.mean(dt.predict(pca_test_vecs) == ng_test.target))

#cross_val_score(dt, pca_train_vecs, ng_train.target, cv=10).mean()

sklearn Decision Tree Accuracy: 0.7840112201963534


In [8]:
rf = RandomForestClassifier(random_state=0)
rf.fit(pca_train_vecs,ng_train.target)
print("sklearn Random Forest Accuracy:", np.mean(rf.predict(pca_test_vecs) == ng_test.target))

#cross_val_score(rf, pca_train_vecs, ng_train.target, cv=10).mean()

sklearn Random Forest Accuracy: 0.7966339410939691


In [9]:
classes_arr = np.unique(ng_train.target)
classes = utils.utils.class_to_idx(classes_arr)

In [10]:
tc = TreeClassifier(data=pca_train_vecs, labels=ng_train.target, max_depth=5, classes=classes, verbose=False)
start = time.time()
tc.fit()
end = time.time()
print("Train accuracy:", np.mean(tc.predict_batch(pca_train_vecs)[0] == ng_train.target))
print("Test accuracy:", np.mean(tc.predict_batch(pca_test_vecs)[0] == ng_test.target))
print("Num queries:", tc.num_queries)
print("Runtime:", end-start)

Train accuracy: 0.8704566635601119
Test accuracy: 0.7840112201963534
Num queries: 7435
Runtime: 7.624583005905151


In [11]:
tc = TreeClassifier(data=pca_train_vecs, labels=ng_train.target, max_depth=5, classes=classes, solver="EXACT", verbose=False)
start = time.time()
tc.fit()
end = time.time()
print("Train accuracy:", np.mean(tc.predict_batch(pca_train_vecs)[0] == ng_train.target))
print("Test accuracy:", np.mean(tc.predict_batch(pca_test_vecs)[0] == ng_test.target))
print("Num queries:", tc.num_queries)
print("Runtime:", end-start)

Train accuracy: 0.8713886300093197
Test accuracy: 0.7840112201963534
Num queries: 5294
Runtime: 4.370760202407837


# Make the dataset huge

In [12]:
doublings = 4
pca_train_vecs_huge = copy.deepcopy(pca_train_vecs)
pca_train_labels_huge = copy.deepcopy(ng_train.target)
print(pca_train_vecs_huge.shape)
for i in range(doublings):
    pca_train_vecs_huge = np.concatenate((pca_train_vecs_huge, pca_train_vecs_huge))
    pca_train_labels_huge = np.concatenate((pca_train_labels_huge, pca_train_labels_huge))
print(pca_train_vecs_huge.shape)

(1073, 100)
(17168, 100)


In [13]:
tc = TreeClassifier(data=pca_train_vecs_huge, labels=pca_train_labels_huge, max_depth=2, classes=classes, verbose=True, random_state=0)
start = time.time()
tc.fit()
end = time.time()
print("Train accuracy:", np.mean(tc.predict_batch(pca_train_vecs)[0] == ng_train.target))
print("Test accuracy:", np.mean(tc.predict_batch(pca_test_vecs)[0] == ng_test.target))
print("Num queries:", tc.num_queries)
print("Runtime:", end-start)
tc.tree_print()

Calculated split with 1900 queries
Calculated split with 11428 queries
Calculated split with 22840 queries
Fitting finished
Train accuracy: 0.8070829450139795
Test accuracy: 0.761570827489481
Num queries: 36168
Runtime: 9.714073181152344
|--- feature_1 <= -0.037112264624637015
|   |--- feature_3 <= 0.11134642234670764
|   |   |--- class: 0
|   |--- feature_3 > 0.11134642234670764
|   |   |--- class: 1
|--- feature_1 > -0.037112264624637015
|   |--- feature_3 <= -0.07697285483262639
|   |   |--- class: 0
|   |--- feature_3 > -0.07697285483262639
|   |   |--- class: 1




In [14]:
# We should implement vanilla TreeClassifier --> which uses identity bins

tc = TreeClassifier(data=pca_train_vecs_huge, labels=pca_train_labels_huge, max_depth=2, classes=classes, solver="EXACT", verbose=True, random_state=0)
start = time.time()
tc.fit()
end = time.time()
print("Train accuracy:", np.mean(tc.predict_batch(pca_train_vecs)[0] == ng_train.target))
print("Test accuracy:", np.mean(tc.predict_batch(pca_test_vecs)[0] == ng_test.target))
print("Num queries:", tc.num_queries)
print("Runtime:", end-start)
tc.tree_print()

Calculated split with 17168 queries
Calculated split with 5728 queries
Calculated split with 11440 queries
Fitting finished
Train accuracy: 0.8070829450139795
Test accuracy: 0.761570827489481
Num queries: 34336
Runtime: 18.33877205848694
|--- feature_1 <= -0.037112264624637015
|   |--- feature_3 <= 0.11134642234670764
|   |   |--- class: 0
|   |--- feature_3 > 0.11134642234670764
|   |   |--- class: 1
|--- feature_1 > -0.037112264624637015
|   |--- feature_3 <= -0.07697285483262639
|   |   |--- class: 0
|   |--- feature_3 > -0.07697285483262639
|   |   |--- class: 1




# Verify our implementation of RFC agrees with sklearn's

# Classification:

In [68]:
from data_structures.wrappers.random_forest_classifier import RandomForestClassifier as RFC_ours
from data_structures.wrappers.extremely_random_forest_classifier import ExtremelyRandomForestClassifier as ERFC_ours

from data_structures.wrappers.random_forest_regressor import RandomForestRegressor as RFR_ours
from data_structures.wrappers.extremely_random_forest_regressor import ExtremelyRandomForestRegressor as ERFR_ours

from sklearn.ensemble import RandomForestClassifier as RFC_sklearn
from sklearn.bensemble import ExtraTreesClassifier as ERFC_sklearn

from sklearn.ensemble import RandomForestRegressor as RFR_sklearn
from sklearn.ensemble import ExtraTreesRegressor as ERFR_sklearn

from utils.constants import GINI, BEST, EXACT, MSE

# TODO(@motiwari): Allow for gradient boosted comparisons as well
def compare_accuracies(
    compare: str = "RFC",
    train_data: np.ndarray = None,
    train_targets: np.ndarray = None,
    test_data: np.ndarray = None,
    test_targets: np.ndarray = None,
    num_seeds: int = 10,
) -> bool:
    our_train_accs = []
    our_test_accs = []
    their_train_accs = []
    their_test_accs = []
    for seed in range(num_seeds):
        # Ok to have n_jobs = -1 throughout?
        if compare == "RFC":
            our_model = RFC_ours(data=train_data, labels=train_labels, n_estimators=5, max_depth=5, min_samples_split=2, min_impurity_decrease=0, max_leaf_nodes=None, budget=None, criterion=GINI, splitter=BEST, solver=EXACT, random_state=seed, verbose=False)
            their_model = RFC_sklearn(n_estimators=5, criterion='gini', max_depth=5, min_samples_split=2, min_impurity_decrease=0, max_leaf_nodes=None, n_jobs=-1, random_state=seed, verbose=0)
        elif compare == "ERFC":
            our_model = ERFC_ours(data=train_data, labels=train_labels, n_estimators=5, max_depth=5, num_bins=None, min_samples_split=2, min_impurity_decrease=0, max_leaf_nodes=None, budget=None, criterion=GINI, splitter=BEST, solver=EXACT, random_state=seed, verbose=False)
            their_model = ERFC_sklearn(n_estimators=5, criterion='gini', max_depth=5, min_samples_split=2, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=False, n_jobs=-1, random_state=seed, verbose=0)
        elif compare == "RFR":
            our_model = RFR_ours(data=train_data, labels=train_labels, n_estimators=5, max_depth=5, min_samples_split=2, min_impurity_decrease=0, max_leaf_nodes=None, budget=None, criterion=MSE, splitter=BEST, solver=EXACT, random_state=seed, verbose=False)
            their_model = RFR_sklearn(n_estimators=5, criterion='squared_error', max_depth=5, min_samples_split=2, max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, n_jobs=-1, random_state=seed, verbose=0)    
        elif compare == "ERFR":
            our_model = ERFR_ours(data=train_data, labels=train_labels, n_estimators=5, max_depth=5, num_bins=None, min_samples_split=2, min_impurity_decrease=0, max_leaf_nodes=None, budget=None, criterion=MSE, splitter=BEST, solver=EXACT, random_state=seed, verbose=False)
            their_model = ERFR_sklearn(n_estimators=5, criterion='squared_error', max_depth=5, min_samples_split=2, max_features='auto', min_impurity_decrease=0.0, bootstrap=False, n_jobs=-1, random_state=seed, verbose=0)
        else:
            raise NotImplementedError("Need to decide what models to compare")
        
        our_model.fit()
        their_model.fit(pca_train_vecs_huge, pca_train_labels_huge)

        if compare == "RFC" or compare == "ERFC":
            our_train_acc = np.mean(our_model.predict_batch(train_data)[0] == train_targets)
            our_test_acc = np.mean(our_model.predict_batch(test_data)[0] == test_targets)
            their_train_acc = np.mean(their_model.predict(train_data) == train_targets)
            their_test_acc = np.mean(their_model.predict(test_data) == test_targets)
        elif compare == "RFR" or compare == "ERFR":
            our_train_acc = np.mean((our_model.predict_batch(train_data) - train_targets)**2)
            our_test_acc = np.mean((our_model.predict_batch(test_data) - test_targets)**2)
            their_train_acc = np.mean((our_model.predict_batch(train_data) - train_targets)**2)
            their_test_acc = np.mean((our_model.predict_batch(test_data) - test_targets)**2)

        print("(Ours) Train accuracy:", our_train_acc)
        print("(Ours) Test accuracy:", our_test_acc)
        print("(Theirs) Train accuracy:", their_train_acc)
        print("(Theirs) Test accuracy:", their_test_acc)
        print("-" * 30)

        our_train_accs.append(our_train_acc)
        our_test_accs.append(our_test_acc)
        their_train_accs.append(their_train_acc)
        their_test_accs.append(their_test_acc)
    
    our_avg_train = np.mean(our_train_accs)
    our_std_train = np.std(our_train_accs)

    our_avg_test = np.mean(our_test_accs)
    our_std_test = np.std(our_test_accs)
    
    their_avg_train = np.mean(their_train_accs)
    their_std_train = np.std(their_train_accs)
    
    their_avg_test = np.mean(their_test_accs)
    their_std_test = np.std(their_test_accs)
    
    # See if confidence intervals overlap
    overlap = np.abs(their_avg_test - our_avg_test) < their_std_test + our_std_test
    return overlap, our_avg_train, our_std_train, our_avg_test, our_std_test, their_avg_train, their_std_train, their_avg_test, their_std_test

In [69]:
train_data = pca_train_vecs
train_labels = ng_train.target
test_data = pca_test_vecs
test_labels = ng_test.target

compare_accuracies("RFC", train_data, train_labels, test_data, test_labels)

(Ours) Train accuracy: 0.8648648648648649
(Ours) Test accuracy: 0.7896213183730715
(Theirs) Train accuracy: 0.7931034482758621
(Theirs) Test accuracy: 0.7475455820476858
------------------------------
(Ours) Train accuracy: 0.8704566635601119
(Ours) Test accuracy: 0.7643758765778401
(Theirs) Train accuracy: 0.8443616029822927
(Theirs) Test accuracy: 0.7643758765778401
------------------------------
(Ours) Train accuracy: 0.875116495806151
(Ours) Test accuracy: 0.7601683029453016
(Theirs) Train accuracy: 0.7996272134203168
(Theirs) Test accuracy: 0.7545582047685835
------------------------------
(Ours) Train accuracy: 0.8564771668219944
(Ours) Test accuracy: 0.726507713884993
(Theirs) Train accuracy: 0.8285181733457595
(Theirs) Test accuracy: 0.7769985974754559
------------------------------
(Ours) Train accuracy: 0.8499534016775396
(Ours) Test accuracy: 0.729312762973352
(Theirs) Train accuracy: 0.8387698042870456
(Theirs) Test accuracy: 0.7629733520336606
-----------------------------

(True,
 0.8659832246039143,
 0.009922707663900228,
 0.7559607293127629,
 0.018232819074333793,
 0.8183597390493942,
 0.01963531888120251,
 0.7612903225806451,
 0.01223978918893207)

In [70]:
compare_accuracies("ERFC", train_data, train_labels, test_data, test_labels)

(Ours) Train accuracy: 0.7185461323392358
(Ours) Test accuracy: 0.6367461430575035
(Theirs) Train accuracy: 0.777260018639329
(Theirs) Test accuracy: 0.7307152875175316
------------------------------
(Ours) Train accuracy: 0.7315936626281454
(Ours) Test accuracy: 0.638148667601683
(Theirs) Train accuracy: 0.700838769804287
(Theirs) Test accuracy: 0.6900420757363254
------------------------------
(Ours) Train accuracy: 0.7073625349487418
(Ours) Test accuracy: 0.5974754558204769
(Theirs) Train accuracy: 0.7092264678471575
(Theirs) Test accuracy: 0.6774193548387096
------------------------------
(Ours) Train accuracy: 0.7073625349487418
(Ours) Test accuracy: 0.6199158485273493
(Theirs) Train accuracy: 0.6952469711090401
(Theirs) Test accuracy: 0.6605890603085554
------------------------------
(Ours) Train accuracy: 0.700838769804287
(Ours) Test accuracy: 0.6255259467040674
(Theirs) Train accuracy: 0.6421248835041938
(Theirs) Test accuracy: 0.6143057503506312
------------------------------

(True,
 0.7089468779123951,
 0.0210387168543773,
 0.6175315568022441,
 0.016865904357043132,
 0.6953401677539609,
 0.048951665274391376,
 0.6605890603085554,
 0.04692487467869127)

# Regression: 

### Weirdly, we get exactly the same results as sklearn for all seeds. We should investigate whether this is a bug

In [73]:
np.random.seed(0)

data, targets = fetch_california_housing(return_X_y=True)
random_idcs = np.random.choice(len(data), size=len(data),replace=False)
data = data[random_idcs]
targets = targets[random_idcs]

TRAIN_TEST_SPLIT = 16000
train_data = data[:TRAIN_TEST_SPLIT]
train_targets = targets[:TRAIN_TEST_SPLIT]

test_data = data[TRAIN_TEST_SPLIT:]
test_targets = targets[TRAIN_TEST_SPLIT:]

In [None]:
compare_accuracies("RFR", train_data, train_targets, test_data, test_targets)

(Ours) Train accuracy: 3.6047103904415625
(Ours) Test accuracy: 3.557280888823238
(Theirs) Train accuracy: 3.6047103904415625
(Theirs) Test accuracy: 3.557280888823238
------------------------------
(Ours) Train accuracy: 3.698837202030057
(Ours) Test accuracy: 3.649268394695702
(Theirs) Train accuracy: 3.698837202030057
(Theirs) Test accuracy: 3.649268394695702
------------------------------
(Ours) Train accuracy: 3.668971394724753
(Ours) Test accuracy: 3.6183697080805217
(Theirs) Train accuracy: 3.668971394724753
(Theirs) Test accuracy: 3.6183697080805217
------------------------------
(Ours) Train accuracy: 3.732563296199221
(Ours) Test accuracy: 3.6927617072229575
(Theirs) Train accuracy: 3.732563296199221
(Theirs) Test accuracy: 3.6927617072229575
------------------------------
(Ours) Train accuracy: 3.6404918646836975
(Ours) Test accuracy: 3.5838844263183107
(Theirs) Train accuracy: 3.6404918646836975
(Theirs) Test accuracy: 3.5838844263183107
------------------------------
(Ours

In [72]:
compare_accuracies("ERFR", train_data, train_labels, test_data, test_labels)

(Ours) Train accuracy: 0.21130813838301798
(Ours) Test accuracy: 0.23171692519662102
(Theirs) Train accuracy: 0.21130813838301798
(Theirs) Test accuracy: 0.23171692519662102
------------------------------


KeyboardInterrupt: 