This notebook is where Elizabeth is developing the code for the decision tree.

`hopkins-knowledge.csv` contains the knowledge base from [here](https://github.com/drdevinhopkins/20_Questions/blob/master/knowledge_base.csv); used only for development.

**HOW TO PLAY: Enter 0 for no and 1 for yes.**

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the development knowledge base from disk.
kn = pd.read_csv('hopkins-knowledge.csv')

# The object names are the targets; the features are every column from 'Hair' through 'Invertebrate'.
y = kn['Animal']
X = kn.loc[:, 'Hair':'Invertebrate']

# Sanity check on the size of what was loaded.
n_objects, n_features = len(y), len(X.columns)
print('There are {0} objects and {1} features for each object.'.format(n_objects, n_features))

There are 100 objects and 28 features for each object.


In [4]:
def dist_from_1(feat_col):
    """
    Scores how evenly a feature column would split the remaining objects (lower is better).

    The split cardinality ratio (SCR) is computed as (size of the smaller group) / (size of the larger
    group), so it is 1 for a perfect 50/50 split and approaches 0 as the split gets more lopsided. Using
    smaller/larger keeps the score symmetric: a 2:1 split and a 1:2 split are equally good and get the
    same score, whichever value happens to be the majority.

    Arg:
        feat_col: pandas series, one feature column of X.
    Returns:
        The absolute distance from 1 of the SCR, or the flag value 10e10 if the column does not contain
        exactly two distinct values (i.e. it cannot split the data).
    """
    counts = feat_col.value_counts()

    # A column with a single value (or no rows at all) cannot divide the dataset.
    if len(counts) != 2:
        return 10e10  # some arbitrarily large value; not the minimum

    # Positional min/max rather than counts[0] / counts[1]: no assumption about the labels in the column.
    ratio = counts.min() / counts.max()
    return abs(1 - ratio)


def rank_features(df):
    """
    Scores every column of df with dist_from_1 and orders the scores from smallest to largest.

    Arg:
        df: pandas dataframe with features as columns.
    Returns:
        pandas series indexed by feature name, holding each feature's score in ascending order.
    """
    scores = df.apply(dist_from_1, axis=0)
    return scores.sort_values(ascending=True)

# rank_features(X).index[0]  # returns index of 0th element, i.e. feature name
# rank_features(X)[0]        # returns value of 0th element

0.18181818181818177

# It lives!

- TODO: Incorporate handling for unknown answers, i.e. not 0 or 1?

### TODO for Anna:

In [None]:
def ask_about_feature(feat_name, counter):
    """
    Prints the numbered question for a feature.

    ANNA: Modify this function to print out a natural language question based on the feature name,
    e.g. "Does it have wings?". For now the question is just the feature name followed by '?'.

    Arg:
        feat_name: string, name of feature to split dataset on
        counter: int, the count of the current question.
    Prints:
        A string of the form 'Q<counter>: <feat_name>?'.
    Returns:
        Nothing.
    """
    prefix = 'Q' + str(counter) + ': '
    print(prefix + feat_name + '?')


def ask_about_object(obj_name, counter):
    """
    Prints the numbered question that guesses an object.

    ANNA: Modify this function to print out a natural language question based on the object name,
    e.g. "Are you thinking of an ocelot?". For now the question is just the object name followed by '?'.

    Arg:
        obj_name: string, name of object to guess.
        counter: int, the count of the current question.
    Prints:
        A string of the form 'Q<counter>: <obj_name>?'.
    Returns:
        Nothing.
    """
    prefix = 'Q' + str(counter) + ': '
    print(prefix + obj_name + '?')
    

### TODO for Rodrigo:

- I only made one `endgame_lose()` function, which is called at every point in the decision tree where the game is lost. You might want to split this up into something like `endgame_lose_questionlimit()` and `endgame_lose_guessedwrong()` etc., since that might make it clearer what action needs to be taken.

In [None]:
def endgame_lose(answers):
    """
    Placeholder for what happens when the game is lost: prints a message and the collected answers.

    RODRIGO: If the game is lost, we will need to figure out why (was the 20Q limit reached? Or was the
    user's object not in the knowledge base?) and take action based on that. The code to add in unknown
    objects can be incorporated here. The answers dictionary is passed in so it is available for that.

    Arg:
        answers: a dictionary where the keys are features and the values are the user's answers to those features
    """
    for parts in (('dangit',), ('Answers:', answers)):
        print(*parts)

        
def endgame_win():
    """
    Placeholder for what happens when the game is won: prints a victory message.

    RODRIGO: Replace this with whatever should happen in the event that the game was won.
    """
    victory_message = 'oh yeah! I rock'
    print(victory_message)

Done by Elizabeth:

In [55]:
def split_df_on_feature(df, feature, answer):
    """
    Keeps the rows of df whose value in the feature column equals answer, then removes that column.

    Args:
        df: pandas dataframe with features as columns, populated by 0s and 1s, one row per instance
        feature: string, the column name to split on
        answer: int, 0 or 1, reflecting which subset of the dataframe to keep
    Returns:
        pandas dataframe (subset of df's rows) without the feature column. df itself is not modified.
    """
    matches_answer = df[feature] == answer
    surviving_rows = df.loc[matches_answer]
    return surviving_rows.drop(columns=[feature])



def ask_and_split_on_answer(feature, counter, df, answers):
    """
    Prints question about the supplied feature, gets the answer, and then splits the dataset, returning
    only those instances where the answer holds.
    Also adds the feature and answer to the answers dictionary.

    The reply is re-requested until it is exactly 0 or 1, so a typo neither crashes the game nor silently
    filters out every remaining object.

    Args:
        feature: a string, a column in df
        counter: int, the count of the current question.
        df: pandas dataframe with features as columns, populated by 0s and 1s, one row per instance
        answers: a dictionary where the keys are features and the values are the user's answers to those features
    Returns:
        pandas dataframe with features as columns (subset of df).
        answers dict with feature added (the same dict object that was passed in).
    """
    ask_about_feature(feature, counter)

    # Keep reading until the reply is a valid yes/no; surrounding whitespace is ignored.
    while True:
        reply = input().strip()
        if reply in ('0', '1'):
            break
        print('Please enter 0 for no or 1 for yes.')

    answ = int(reply)
    answers[feature] = answ
    return split_df_on_feature(df, feature, answ), answers
    
    
def guess_rem_objs(y, X_idcs, counter, answers):
    """
    If dataset cannot be split by features anymore, but multiple objects still remain, this function
    guesses the remaining objects one by one, in shuffled order, until endgame.

    The game is won as soon as the user confirms a guess. It is lost if every candidate is rejected, if
    the 20-question limit is reached first, or if there was nothing to guess in the first place.

    Args:
        y: pandas series, all objects in dataset
        X_idcs: pandas index, the remaining indices to choose from (all non-candidates having been pruned)
        counter: int, the count of the current question.
        answers: a dictionary where the keys are features and the values are the user's answers to those features
    Returns:
        Nothing.
    """
    # Subset the ys based on X_idcs and shuffle them. The seed is fixed, so the shuffled order is
    # the same on every run for a given set of candidates.
    ys_to_guess = y[y.index.isin(X_idcs)]
    if not ys_to_guess.empty:
        ys_to_guess = ys_to_guess.sample(frac=1, random_state=3)

    for guess in ys_to_guess:
        # Stop guessing once the question budget is used up.
        if counter > 20:
            break

        ask_about_object(guess, counter)
        counter += 1

        # Keep reading until the reply is a valid yes/no; surrounding whitespace is ignored.
        while True:
            reply = input().strip()
            if reply in ('0', '1'):
                break
            print('Please enter 0 for no or 1 for yes.')

        if reply == '1':
            endgame_win()
            return

    # Getting here means no guess was confirmed (or none could be asked at all), so the game is lost.
    endgame_lose(answers)

    
def play3(X, y, counter, answers):
    """
    Recursively splits knowledge base based on user input about whether target object matches the feature.

    Each call either ends the game (base cases) or asks about the feature that splits the remaining
    objects most evenly, keeps only the objects consistent with the reply, and recurses on that subset.

    Args:
        X: pandas dataframe with features as columns, populated by 0s and 1s, one row per instance
        y: pandas series, all objects in dataset
        counter: int, the count of the current question.
        answers: a dictionary where the keys are features and the values are the user's answers to those features
    Returns:
        nothing.
    """

    # =============================
    # BASE CASE 0: counter > 20
    # =============================
    if counter > 20:
        print('TOO MANY QUESTIONS!')
        endgame_lose(answers)
        return

    # =============================
    # BASE CASE: No rows left. The previous answer ruled out every remaining object, so there is nothing
    # to guess.
    # =============================
    if len(X) == 0:
        print('NO OBJECTS LEFT TO GUESS!')
        endgame_lose(answers)
        return

    # =============================
    # BASE CASE 1: Only one row left in the data, so only one object available to guess.
    # =============================
    if len(X) == 1:
        print('ONLY ONE OBJECT LEFT!')
        guess = str(y.loc[X.index[0]])
        ask_about_object(guess, counter)

        # Keep reading until the reply is a valid yes/no; surrounding whitespace is ignored.
        while True:
            reply = input().strip()
            if reply in ('0', '1'):
                break
            print('Please enter 0 for no or 1 for yes.')

        if reply == '1':
            endgame_win()
        else:
            endgame_lose(answers)
        return

    # =============================
    # BASE CASE 2: Only one feature left in the data (have asked about all other ones). Will need to ask about that
    # feature, subset the data correspondingly, and then go through all remaining objects.
    # =============================
    if len(X.columns) == 1:
        print('ONLY ONE FEATURE LEFT!')
        feature_to_split_on = X.columns[0]
        X_bc2, answers = ask_and_split_on_answer(feature_to_split_on, counter, X, answers)
        counter += 1

        # If there are no remaining objects to guess after splitting the data on this feature, then endgame_lose().
        if len(X_bc2.index) == 0:
            print('NO OBJECTS LEFT TO GUESS!')
            endgame_lose(answers)
            return

        # The question just asked may have been the 20th; in that case there is no budget left for guesses.
        if counter > 20:
            print('TOO MANY QUESTIONS!')
            endgame_lose(answers)
            return

        # Otherwise, cycle through all remaining objects until endgame.
        guess_rem_objs(y, X_bc2.index, counter, answers)  # includes endgame
        return

    # =============================
    # BASE CASE 3: All features have either all 0s or all 1s for all remaining objects (observable from the value of
    # min_feat_diffc, which is set to the flag value 10e10 if corresponding feature contains all 0s or all 1s).
    # Can't divide dataset any more, so will just need to cycle through all remaining objects until endgame.
    # =============================

    # Get list of features, ranked by distance of their SCR from 1 (i.e. sorted by increasing 1-SCR), and identify min.
    ranked_feats = rank_features(X)
    min_feat_diffc = ranked_feats.iloc[0]  # positional: ranked_feats is indexed by feature name

    if min_feat_diffc == 10e10:
        print('NO MORE DISTINGUISHING FEATURES!')
        guess_rem_objs(y, X.index, counter, answers)  # includes endgame
        return

    # =============================
    # RECURSIVE CASE: If we get this far, that means we didn't fall into any of the base cases, so the game can be played!
    # =============================

    # For a lil sprinkling of nondeterminism, we want to randomly choose between multiple features, if they are equally
    # good at splitting the dataset, i.e. if multiple features share the same min_feat_diffc.
    min_feats = ranked_feats[ranked_feats == min_feat_diffc].index
    if len(min_feats) > 1:
        print('COMPETING FEATURES!', min_feats)
        feat_to_split_on = np.random.choice(min_feats)
    else:
        feat_to_split_on = ranked_feats.index[0]

    X_new, answers = ask_and_split_on_answer(feat_to_split_on, counter, X, answers)
    counter += 1
    play3(X_new, y, counter, answers)

    
# Start a new game: the first question is Q1 and no answers have been recorded yet.
play3(X, y, counter=1, answers={})