In [13]:
# Download all datasets from sklearn
from sklearn.datasets import *
for m in [fetch_olivetti_faces, fetch_20newsgroups_vectorized, fetch_lfw_people, fetch_lfw_pairs, fetch_covtype, fetch_rcv1, fetch_kddcup99, fetch_california_housing]:
    print(m)
    try:
        all_ = m()
        train = m(subset='train')
        test = m(subset='test')
    except:
        pass

<function fetch_olivetti_faces at 0x7fef48663940>
<function fetch_20newsgroups_vectorized at 0x7fef6889ee50>
<function fetch_lfw_people at 0x7fef888425e0>
<function fetch_lfw_pairs at 0x7fef88842700>
<function fetch_covtype at 0x7fef6885be50>
<function fetch_rcv1 at 0x7fef688f99d0>
<function fetch_kddcup99 at 0x7fef88842040>
<function fetch_california_housing at 0x7fef688f9820>


In [83]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

# Download the data from two categories
cats = ['alt.atheism', 'sci.space']
ng_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), categories=cats)
ng_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'), categories=cats)


vectorizer = TfidfVectorizer()
trans = vectorizer.fit(ng_train.data)
train_vectors = vectorizer.transform(ng_train.data)
test_vectors = vectorizer.transform(ng_test.data)
print("Number of datapoints: ", len(ng_train.data))
print("Number of features: ", train_vectors.shape[1])
print("Balance: ", np.sum(ng_train.target) / len(ng_train.target)) # 55-45, roughly balanced

N_COMPONENTS=100
pca = PCA(n_components=N_COMPONENTS)
pca.fit(train_vectors.toarray())
pca_train_vecs = pca.transform(train_vectors.toarray())
pca_test_vecs = pca.transform(test_vectors.toarray())

Number of datapoints:  1073
Number of features:  18217
Balance:  0.5526561043802423


In [86]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state=0)
dt.fit(pca_train_vecs,ng_train.target)
print("Decision Tree Accuracy:", np.mean(dt.predict(pca_test_vecs) == ng_test.target))

#cross_val_score(dt, pca_train_vecs, ng_train.target, cv=10).mean()

Accuracy: 0.7741935483870968


In [90]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=0)
rf.fit(pca_train_vecs,ng_train.target)
print("Random Forest Accuracy:", np.mean(rf.predict(pca_test_vecs) == ng_test.target))

#cross_val_score(rf, pca_train_vecs, ng_train.target, cv=10).mean()

Random Forest Accuracy: 0.8008415147265077


In [108]:
from data_structures.tree_classifier import TreeClassifier
import utils.utils

classes_arr = np.unique(ng_train.target)
classes = utils.utils.class_to_idx(classes_arr)

In [109]:
tc = TreeClassifier(data=pca_train_vecs, labels=ng_train.target, max_depth=5, classes=classes)
tc.fit()
print("Train accuracy:", np.mean(tc.predict_batch(pca_train_vecs)[0] == ng_train.target))
print("Test accuracy:", np.mean(tc.predict_batch(pca_test_vecs)[0] == ng_test.target))
print("Num queries:", tc.num_queries)

Calculated split with 2073 queries
Calculated split with 658 queries
Calculated split with 1415 queries
Calculated split with 402 queries
Calculated split with 1013 queries
Calculated split with 86 queries
Calculated split with 827 queries
Calculated split with 33 queries
Calculated split with 269 queries
Calculated split with 631 queries
Calculated split with 27 queries
Calculated split with 227 queries
Calculated split with 42 queries
Calculated split with 3 queries
Calculated split with 824 queries
Calculated split with 12 queries
Calculated split with 15 queries
Calculated split with 70 queries
Calculated split with 16 queries
Calculated split with 278 queries
Calculated split with 253 queries
Calculated split with 72 queries
Calculated split with 81 queries
Calculated split with 30 queries
Calculated split with 3 queries
Calculated split with 13 queries
Calculated split with 2 queries
Calculated split with 2 queries
Calculated split with 276 queries
Calculated split with 1 queries

In [110]:
tc = TreeClassifier(data=pca_train_vecs, labels=ng_train.target, max_depth=5, classes=classes, solver="EXACT")
tc.fit()
print("Train accuracy:", np.mean(tc.predict_batch(pca_train_vecs)[0] == ng_train.target))
print("Test accuracy:", np.mean(tc.predict_batch(pca_test_vecs)[0] == ng_test.target))
print("Num queries:", tc.num_queries)

Calculated split with 1073 queries
Calculated split with 358 queries
Calculated split with 715 queries
Calculated split with 94 queries
Calculated split with 621 queries
Calculated split with 182 queries
Calculated split with 439 queries
Calculated split with 110 queries
Calculated split with 72 queries
Calculated split with 30 queries
Calculated split with 64 queries
Calculated split with 331 queries
Calculated split with 27 queries
Calculated split with 12 queries
Calculated split with 15 queries
Calculated split with 46 queries
Calculated split with 18 queries
Calculated split with 327 queries
Calculated split with 4 queries
Calculated split with 3 queries
Calculated split with 436 queries
Calculated split with 2 queries
Calculated split with 28 queries
Calculated split with 311 queries
Calculated split with 16 queries
Calculated split with 13 queries
Calculated split with 2 queries
Calculated split with 1 queries
Calculated split with 11 queries
Fitting finished
Train accuracy: 0.8