This notebook is where Elizabeth is developing the code for the decision tree.

`hopkins-knowledge.csv` contains the knowledge base from [here](https://github.com/drdevinhopkins/20_Questions/blob/master/knowledge_base.csv); used only for development.

In [1]:
import pandas as pd
import numpy as np

In [44]:
# Load the development knowledge base: one row per object.
kn = pd.read_csv('hopkins-knowledge.csv')

# Target labels (the name stored in the 'Animal' column) ...
y = kn['Animal']
# ... and the feature columns, taken as the label-inclusive slice Hair..Invertebrate.
X = kn.loc[:, 'Hair':'Invertebrate']

# Quick sanity check on the size of the data.
print('There are {0} objects and {1} features for each object.'.format(len(y), len(X.columns)))

There are 100 objects and 28 features for each object.


In [47]:
# Feature rows of the objects whose 'Hair' value is 0
X.loc[X['Hair'] == 0]

Unnamed: 0,Hair,Feathers,Eggs,Milk,Airborne,Aquatic,Predator,Toothed,Backbone,Breathes,...,Tail,Domestic,Catsize,Mammal,Bird,Reptile,Fish,Amphibian,Insect,Invertebrate
2,0,0,1,0,0,1,1,1,1,0,...,1,0,0,0,0,0,1,0,0,0
7,0,0,1,0,0,1,0,1,1,0,...,1,1,0,0,0,0,1,0,0,0
8,0,0,1,0,0,1,1,1,1,0,...,1,0,0,0,0,0,1,0,0,0
11,0,1,1,0,1,0,0,0,1,1,...,1,1,0,0,1,0,0,0,0,0
12,0,0,1,0,0,1,1,1,1,0,...,1,0,0,0,0,0,1,0,0,0
13,0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
14,0,0,1,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
15,0,0,1,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
16,0,1,1,0,1,0,1,0,1,1,...,1,0,0,0,1,0,0,0,0,0
18,0,0,1,0,0,1,1,1,1,0,...,1,0,1,0,0,0,1,0,0,0


In [147]:
# Preview the label-inclusive column slice Hair..Predator (all rows are kept);
# the smaller-input timing cells further down run on this same subset.
X.loc[:, 'Hair':'Predator']

Unnamed: 0,Hair,Feathers,Eggs,Milk,Airborne,Aquatic,Predator
0,1,0,0,1,0,0,1
1,1,0,0,1,0,0,0
2,0,0,1,0,0,1,1
3,1,0,0,1,0,0,1
4,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...
95,1,0,0,1,0,0,0
96,1,0,1,0,1,0,0
97,1,0,0,1,0,0,1
98,0,0,1,0,0,0,0


In [96]:
# One way: define a function that computes difference from 1, apply it to df, and then choose col with min diff
# Problem: maybe not so efficient, since has to run over all columns and then minimise
# But: might be better, since it uses .apply() instead of a for loop, so it might be faster
%%timeit
def dist_from_1(attrib_col):
    """Score how far a two-valued feature column is from an even split.

    To be applied to each column of the feature frame. The score is
    ``abs(1 - n_zeros / n_ones)``: 0 means the column has as many 0s as 1s,
    and smaller is better.

    Parameters
    ----------
    attrib_col : pandas.Series
        One feature column. Assumed to hold 0/1 values (the counts are looked
        up under the labels 0 and 1) -- TODO confirm for new knowledge bases.

    Returns
    -------
    float
        The distance of the 0:1 count ratio from 1, or ``inf`` when the column
        does not contain exactly two distinct values.
    """
    counts = attrib_col.value_counts()

    if len(counts) == 2:  # i.e. there are both 1s and 0s in the column
        ratio = counts[0] / counts[1]
        # NOTE(review): the ratio is not symmetric -- a 1:2 split scores 0.5
        # while a 2:1 split scores 1.0. Confirm that this is intended.
        return abs(1 - ratio)  # difference between 1 and the ratio -- minimise this

    # A single-valued column cannot split the objects at all. The result is
    # minimised with .idxmin(), so the sentinel must be larger than every real
    # score; the previous -1 was smaller than all of them and therefore made
    # the useless column win.
    return float('inf')
# Score every column of X with dist_from_1, then pick the best one:
# .idxmin() returns the index label of the minimum value, which here is the feature name.
X.apply(dist_from_1, axis=0).idxmin()

35.6 ms ± 3.24 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [97]:
# Another way: use a for loop to go through each feature, and update minimum as you go 
%%timeit
def feat_nearest_1(df):
    """Return the feature of ``df`` whose 0:1 count ratio is closest to 1.

    Loops over the columns once, keeping track of the best score seen so far.
    A column's score is ``abs(1 - n_zeros / n_ones)``; smaller is better.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns to search. Assumed to hold 0/1 values (the counts are
        looked up under the labels 0 and 1) -- TODO confirm for new data.

    Returns
    -------
    str
        Name of the best column (the first one wins on ties), or ``''`` when
        no column contains exactly two distinct values.
    """
    best_dist = float('inf')
    best_feat = ''
    for feat in df.columns:
        # Read from the frame that was passed in, not from the notebook-level X,
        # so the function also works on subsets and on other frames.
        counts = df[feat].value_counts()
        if len(counts) != 2:
            # A single-valued column cannot split the objects, and indexing
            # counts[0] / counts[1] on it would raise KeyError.
            continue
        ratio = counts[0] / counts[1]
        dist = abs(1 - ratio)
        if dist < best_dist:
            best_dist = dist
            best_feat = feat
    return best_feat
    
# Run the loop-based search over all feature columns in X
feat_nearest_1(X)

32.9 ms ± 1.17 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
# Now let's see how they scale on a quarter of the columns (the 7 features Hair..Predator out of 28; all rows are kept)

In [148]:
%%timeit
def dist_from_1(attrib_col):
    """Score how far a two-valued feature column is from an even split.

    To be applied to each column of the feature frame. The score is
    ``abs(1 - n_zeros / n_ones)``: 0 means the column has as many 0s as 1s,
    and smaller is better.

    Parameters
    ----------
    attrib_col : pandas.Series
        One feature column. Assumed to hold 0/1 values (the counts are looked
        up under the labels 0 and 1) -- TODO confirm for new knowledge bases.

    Returns
    -------
    float
        The distance of the 0:1 count ratio from 1, or ``inf`` when the column
        does not contain exactly two distinct values.
    """
    counts = attrib_col.value_counts()
    if len(counts) == 2:  # i.e. there are both 1s and 0s in the column
        ratio = counts[0] / counts[1]
        return abs(1 - ratio)  # difference between 1 and the ratio -- minimise this
    # A single-valued column cannot split the objects at all. The result is
    # minimised with .idxmin(), so the sentinel must be larger than every real
    # score; the previous -1 was smaller than all of them and therefore made
    # the useless column win.
    return float('inf')
# Same apply-based search, restricted to the columns Hair..Predator:
# .idxmin() returns the index label of the minimum value, which here is the feature name.
X.loc[:, 'Hair':'Predator'].apply(dist_from_1, axis=0).idxmin()

11 ms ± 531 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [149]:
%%timeit
def feat_nearest_1(df):
    """Return the feature of ``df`` whose 0:1 count ratio is closest to 1.

    Loops over the columns once, keeping track of the best score seen so far.
    A column's score is ``abs(1 - n_zeros / n_ones)``; smaller is better.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns to search. Assumed to hold 0/1 values (the counts are
        looked up under the labels 0 and 1) -- TODO confirm for new data.

    Returns
    -------
    str
        Name of the best column (the first one wins on ties), or ``''`` when
        no column contains exactly two distinct values.
    """
    best_dist = float('inf')
    best_feat = ''
    for feat in df.columns:
        # Read from the frame that was passed in, not from the notebook-level X;
        # otherwise timing on a column subset still indexes the full frame.
        counts = df[feat].value_counts()
        if len(counts) != 2:
            # A single-valued column cannot split the objects, and indexing
            # counts[0] / counts[1] on it would raise KeyError.
            continue
        ratio = counts[0] / counts[1]
        dist = abs(1 - ratio)
        if dist < best_dist:
            best_dist = dist
            best_feat = feat
    return best_feat
    
# Same loop-based search, restricted to the columns Hair..Predator
feat_nearest_1( X.loc[:, 'Hair':'Predator'] )

10.3 ms ± 811 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
