This notebook is where Elizabeth is developing the code for the decision tree.

`hopkins-knowledge.csv` contains the knowledge base from [here](https://github.com/drdevinhopkins/20_Questions/blob/master/knowledge_base.csv); used only for development.

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the development knowledge base: one row per object, one column per feature.
kn = pd.read_csv('hopkins-knowledge.csv')

# Target labels are the animal names; the feature matrix is every column from
# 'Hair' through 'Invertebrate' (label-based slices include both endpoints).
y = kn['Animal']
X = kn.loc[:, 'Hair':'Invertebrate']

# Sanity check: number of rows (objects) and number of feature columns.
print('There are {0} objects and {1} features for each object.'.format(len(y), len(X.columns)))

There are 100 objects and 28 features for each object.


In [4]:
def dist_from_1(feat_col):
    """
    Returns the distance from 1 of the split cardinality ratio (SCR) for the given column of X.

    The SCR is computed as (size of the smaller group) / (size of the larger group), so the
    returned distance is 0 for a perfectly even split and approaches 1 as the split becomes
    more lopsided. Dividing smaller by larger (rather than zeros by ones) keeps the score
    symmetric: a 40/60 column and a 60/40 column are equally balanced and get the same
    distance. It also avoids looking the counts up by the labels 0 and 1, so any column
    with exactly two distinct values can be scored.

    Parameters
    ----------
    feat_col : pandas.Series
        One feature column. Missing values are ignored, because value_counts drops them.

    Returns
    -------
    float
        1 - SCR if the column holds exactly two distinct values; otherwise the flag
        value 10e10, which callers use to recognise a feature that cannot split the data.
    """
    counts = feat_col.value_counts()
    if len(counts) == 2:  # i.e. the column really separates the objects into two groups
        ratio = counts.min() / counts.max()
        return 1 - ratio  # ratio <= 1 by construction, so no abs() is needed
    return 10e10  # flag value, far above any real distance, so it is never the minimum


def feat_nearest_1(df):
    """
    Returns the feature (column name) of the passed-in dataframe whose split cardinality ratio is
    nearest 1, together with that feature's distance from 1 as computed by dist_from_1.

    Returns
    -------
    tuple
        (column label, distance). The distance equals dist_from_1's flag value when no column
        can split the data.
    """
    # Score every column once and reuse the result for both the label and the value.
    dists = df.apply(dist_from_1)
    return dists.idxmin(), dists.min()

# Best first question for the full knowledge base: the feature whose split is closest to even.
feat_to_split_on, dist_from_one = feat_nearest_1(X)
feat_to_split_on
# TODO: probably also want a feat_nearest_3 (a "top 3" of candidate features), in case we have to
# choose another question when the answer is 'unknown'.


def rank_features(df):
    """
    Orders every feature of df from the most to the least balanced split.

    Each column is scored with dist_from_1 and the scores are returned as a Series
    (index = feature names) sorted in ascending order, so the best candidate comes first.
    """
    distances = df.apply(dist_from_1)
    ranked = distances.sort_values()
    return ranked

rank_features(X).index[0]  # label of the top-ranked element, i.e. the feature name
rank_features(X).iloc[0]   # value of the top-ranked element; .iloc because the index holds feature names, not positions

0.18181818181818177

In [5]:
# print(feat_to_split_on, '?')
# answ = int( input() )
# X[X[feat_to_split_on] == answ]

# Proof of concept

In [6]:
def play(X, y, answers):
    """
    Plays one round of 20 Questions by recursively narrowing down the knowledge base.

    At each step the feature with the most even split (see feat_nearest_1) is asked about, the
    rows that disagree with the answer are discarded, the feature is dropped, and the function
    recurses on what is left. When no further question is useful, a guess is printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate objects (rows) and the features that have not been asked about yet (columns).
    y : pandas.Series
        Object names, looked up by the row labels of X.
    answers : dict
        Answers collected so far, keyed by feature name. It is updated in place.

    Returns
    -------
    tuple
        (guess, answers). guess is a single object name, or None if no object matches the
        answers that were given.
    """
    # Flag value returned by dist_from_1 for a feature that contains only 0s or only 1s.
    no_split_flag = 10e10

    # Recursive case: more than one candidate and more than one feature are left, and the best
    # feature still separates the candidates.
    if len(X) > 1 and len(X.columns) > 1:
        feat_to_split_on, dist_from_one = feat_nearest_1(X)

        if dist_from_one < no_split_flag:
            print(feat_to_split_on, '?')
            answ = int( input() )

            answers[feat_to_split_on] = answ

            # Prune X to contain only those instances corresponding to the answer, and drop the feature
            X_new = X[X[feat_to_split_on] == answ].drop(columns=[feat_to_split_on])

            # Pass the result of the deeper call back up; otherwise the outermost call returns None.
            return play(X_new, y, answers)

    # Base case: nothing sensible is left to ask.
    if len(X) == 0:
        # Happens when an answer matches none of the remaining candidates.
        print('\n\nNo object in the knowledge base matches the answers given.')
        return None, answers

    if len(X) == 1:
        # Index with the single label, not with the whole Index, so the guess is the name itself
        # rather than a one-element Series.
        guess = y[X.index[0]]
    else:
        # Several indistinguishable candidates are left: pick one at random.
        rd_guess_idx = np.random.choice(X.index)
        guess = y[rd_guess_idx]

    print('\n\nGUESS:', guess)
    print('ANSWERS:', answers)

    return guess, answers

In [7]:
# play(X, y, dict())

In [8]:
# for gorilla, somehow the whole pandas series is printed out, while when the choice is random, this doesn't happen. why?? 
# play(X, y, dict())

# Non-deterministic

- DONE: random choice of feature if several features share the same |1 − SCR|
- if several objects that cannot be distinguished are left at the end, cycle through them in random order and ask about each one until one is guessed correctly
- still only yes/no answers
- add question counter

In [None]:
def play2(X, y, answers):
    """
    Recursively splits knowledge base based on user input about whether target object matches the feature.

    Features are ranked by how evenly they split the remaining objects (see rank_features). If
    several features tie for the best score, one of them is chosen at random. Once no useful
    question is left, one guess is printed and the user is asked whether it is correct.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate objects (rows) and the features that have not been asked about yet (columns).
    y : pandas.Series
        Object names, looked up by the row labels of X.
    answers : dict
        Answers collected so far, keyed by feature name. It is updated in place.

    Returns
    -------
    None
        All results are reported through print.
    """
    # Flag value returned by dist_from_1 for a feature that contains only 0s or only 1s.
    no_split_flag = 10e10

    # Recursive case: X has more than one row and more than one column, and the best feature
    # still separates the remaining objects.
    if len(X) > 1 and len(X.columns) > 1:

        # Features ranked by increasing distance of their SCR from 1; the first one is the minimum.
        ranked_feats = rank_features(X)
        min_feat_diffc = ranked_feats.iloc[0]  # positional lookup; the index holds feature names

        if min_feat_diffc != no_split_flag:

            # All features that share the minimum distance.
            is_min = (ranked_feats == min_feat_diffc).to_numpy()
            min_feats = ranked_feats.index[is_min]

            # If several features tie, choose randomly between them.
            if len(min_feats) > 1:
                print('  Min feats:', min_feats)
                feat_to_split_on = np.random.choice(min_feats)
            else:
                feat_to_split_on = ranked_feats.index[0]

            # BUILD IN ANNA'S FEATURE NAME -> NATURAL LANGUAGE QUESTION FUNCTION HERE
            print('\n', feat_to_split_on, '?')
            answ = int( input() )

            answers[feat_to_split_on] = answ

            # Prune X to contain only those instances corresponding to the answer, and drop the feature
            X_new = X[X[feat_to_split_on] == answ].drop(columns=[feat_to_split_on])

            play2(X_new, y, answers)
            return

    # Base case.
    if len(X) == 0:
        # Happens when an answer matches none of the remaining candidates.
        print('\n\nNo object in the knowledge base matches the answers given.')
        return

    if len(X) == 1:
        # Only one object is left to guess. Index with the single label, not with the whole
        # Index, so the guess is the name itself rather than a one-element Series.
        guess = y[X.index[0]]
    else:
        # Several candidates are left: guess one of them at random.
        # TODO: cycle through all remaining candidates and ask about each one in turn.
        # NOTE: an earlier loop-based attempt reportedly hung the notebook when the cell was
        # executed repeatedly; the cause has not been confirmed.
        rem_idcs = list(X.index)
        print('\nRemaining indices:', rem_idcs)
        rd_guess_idx = np.random.choice(rem_idcs)
        guess = y[rd_guess_idx]

    print('\n\nGUESS:', guess)

    print('Correct?')
    guessed_right = int( input() )

    msg = 'oh yeah! we rock' if guessed_right == 1 else 'dangit'
    print(msg)
    
# Start an interactive game on the full knowledge base, with no answers recorded yet.
play2(X, y, dict())

In [None]:
# Show the object names stored at row labels 24 and 30 (labels missing from y are skipped by isin).
# NOTE(review): 24 and 30 are presumably the "Remaining indices" printed by one play2 run -- confirm.
print( y[y.index.isin([24, 30])] )