In [1]:
import numpy as np
import pandas as pd 
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import _tree
from sklearn.datasets import load_iris
from sklearn import tree as sktree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import graphviz

# Face embeddings
X_train = pd.read_pickle('./data/Race_x_train.pkl').values
y_train = pd.read_pickle('./data/Race_y_train_so.pkl').values-1
X_test = pd.read_pickle('./data/Race_x_test.pkl').values
y_test = pd.read_pickle('./data/Race_y_test_so.pkl').values-1


In [2]:
# constants
n = 4  # sample number
x = X_test[n:n+1]
y = y_test[n]
n_classes = 4
n_features = 128
n_trees = 50

feature_names = ['f'+str(x) for x in range(n_features)]  # face embeddings

# Train model
model= RandomForestClassifier(random_state = 0, n_estimators=n_trees)
model.fit(X_train, y_train)
accuracy_score(y_test, model.predict(X_test))

0.8950064020486556

In [3]:
def tree_votes(x):
    votes = np.zeros(n_trees, dtype=int)
    for idx, tree in enumerate(model.estimators_):
        votes[idx] = tree.predict(x)
    return votes

def count_votes(x):
    votes = np.zeros(n_classes)
    for tree in model.estimators_:
        votes[int(tree.predict(x))] += 1
    return votes


In [4]:
votes = tree_votes(x)

# egyes fákon milyen featureök mentén haladunk végig
def recurse(idx, tree, node, f_list):
    if tree.feature[node] != _tree.TREE_UNDEFINED:
        # if threshold is close to our value, add it to list
        feature = tree.feature[node]
        threshold = tree.threshold[node]
        input_val = x[0,feature]
        diff = threshold-input_val
        # if condition is close to input value, explore both paths
        if (abs(diff) <= input_val*0.2):
            if (input_val <= threshold):
                # True (left)
                f_list.append((node, feature, 0.0))
                recurse(idx, tree, tree.children_left[node], f_list)
                del f_list[-1]
                # False (right)
                f_list.append((node, feature, diff))
                recurse(idx, tree, tree.children_right[node], f_list)
            else:
                # False (right)
                f_list.append((node, feature, 0.0))
                recurse(idx, tree, tree.children_right[node], f_list)
                del f_list[-1]
                # True (left)
                f_list.append((node, feature, diff))
                recurse(idx, tree, tree.children_left[node], f_list)

        # else go only one path
        elif (input_val <= threshold):
            recurse(idx, tree, tree.children_left[node], f_list)
        else:
            recurse(idx, tree, tree.children_right[node], f_list)
    else:
        output = np.argmax(tree.value[node]) 
        # if output is wrong
        if (output != y):
            info.append((idx, node, output, f_list.copy()))

In [5]:

info = []
for idx, tree in enumerate(model.estimators_):
    #skip incorrect trees
    pred = tree.predict(x)[0]
    if pred != y:
        continue
    feature_list = []
    t = tree.tree_
    recurse(idx, t, 0, feature_list)

# feature_counts = np.zeros(n_features, dtype='int')
for tuple in info:
    idx, outnode, output, line = tuple
    print(" Tree: {}, output: {}".format(idx, output))
    for data in line:
        node, fidx, diff = data
        feature = feature_names[fidx]
        # feature_counts[fidx] += 1
        if (diff != 0):
            print("\t{:4} diff: {:2.4}".format(feature, diff))
        

 Tree: 4, output: 3
	f10  diff: -0.0267
 Tree: 4, output: 2
	f88  diff: -0.04035
 Tree: 9, output: 3
	f77  diff: -0.02279
 Tree: 11, output: 3
	f93  diff: 0.002902
	f57  diff: -0.008323
	f82  diff: -0.007353
 Tree: 11, output: 3
	f93  diff: 0.002902
	f57  diff: -0.008323
	f91  diff: 0.02286
 Tree: 11, output: 3
	f93  diff: 0.002902
	f57  diff: -0.008323
	f91  diff: 0.02286
	f8   diff: 0.01115
 Tree: 13, output: 1
	f38  diff: 0.02086
 Tree: 13, output: 3
	f43  diff: -0.02736
 Tree: 17, output: 3
	f10  diff: -0.009361
 Tree: 24, output: 3
	f8   diff: 0.02538
 Tree: 25, output: 3
	f68  diff: -0.01591
	f82  diff: 0.001768
 Tree: 29, output: 3
	f43  diff: -0.03574
 Tree: 31, output: 3
	f43  diff: -0.01156
 Tree: 35, output: 3
	f88  diff: -0.0313
 Tree: 37, output: 3
	f105 diff: -0.02174
 Tree: 37, output: 1
	f105 diff: -0.02174
	f114 diff: 0.02431
 Tree: 39, output: 1
	f112 diff: -0.007024
 Tree: 41, output: 3
	f44  diff: 0.02301
 Tree: 42, output: 2
	f43  diff: -0.0262
 Tree: 47, output: 3

In [8]:
# feature módosítások
x_ = x.copy()

x_[:,10] += -0.0268  # Tree 4, Tree 17
x_[:,77] += -0.0228  # Tree 9
x_[:,43] += -0.02737  # Tree 13, 29, 31
x_[:,8] += 0.02539  # Tree 24
x_[:,88] += -0.0314  # Tree 35
x_[:,105] += -0.02175  # Tree 37
x_[:,44] += 0.02302  # Tree 41

print('tree votes:')
print(count_votes(x))
print(count_votes(x_))

tree votes:
[30.  4.  3. 13.]
[20.  4.  4. 22.]


In [7]:
print(model.predict(x))
print(model.predict(x_))

[0.]
[3.]


In [8]:
np.linalg.norm(x-x_)

0.06797640693652465