In [1]:
import pandas as pd
import numpy as np

# Load the knowledge base from disk; the code below treats each row as one object (animal).
kn = pd.read_csv('hopkins-knowledge.csv')

# Targets are the animal names; predictors are every column from 'Hair' through 'Invertebrate'
# (label-based .loc slices include both endpoints).
y = kn['Animal']
X = kn.loc[:, 'Hair':'Invertebrate']
print('There are {0} objects and {1} features for each object.'.format(len(y), len(X.columns)))

There are 100 objects and 28 features for each object.


In [2]:
def ask_about_feature(feat_name, counter):
    """
    Prints the numbered question for a feature, in the form "Q<counter>: <feat_name>?".

    TODO: Phrase this as a natural language question based on the feature name,
    e.g. "Does it have wings?" (currently the raw feature name is printed).

    Args:
        feat_name: string, name of the feature being asked about.
        counter: int, the count of the current question.
    Prints:
        A string, the question about that feature.
    Returns:
        Nothing.
    """
    # Assemble the pieces in one go; feat_name must be a string.
    prompt = ''.join(['Q', str(counter), ': ', feat_name, '?'])
    print(prompt)


def ask_about_object(obj_name, counter):
    """
    Prints the numbered guess for an object, in the form "Q<counter>: <obj_name>?".

    TODO: Phrase this as a natural language question based on the object name,
    e.g. "Are you thinking of an ocelot?" (currently the raw object name is printed).

    Args:
        obj_name: string, name of the object to guess.
        counter: int, the count of the current question.
    Prints:
        A string, the question guessing that object.
    Returns:
        Nothing.
    """
    # Assemble the pieces in one go; obj_name must be a string.
    prompt = ''.join(['Q', str(counter), ': ', obj_name, '?'])
    print(prompt)
    

def dist_from_1(feat_col):
    """
    Scores how evenly a 0/1 feature column splits the data.

    The split cardinality ratio (SCR) is (number of 0s) / (number of 1s); the score is abs(1 - SCR), so a
    perfectly even split scores 0.

    Arg:
        feat_col: pandas series holding one feature column.
    Returns:
        abs(1 - SCR) when the column has exactly two distinct values, otherwise NaN (callers drop the NaNs).
    """
    tally = feat_col.value_counts()

    # A column with anything other than two distinct values cannot be scored here.
    if len(tally) != 2:
        return np.nan

    # value_counts() is indexed by the values themselves, so these are label lookups for 0 and 1.
    n_zeros = tally[0]
    n_ones = tally[1]
    return abs( 1 - n_zeros / n_ones )


def rank_features(df):
    """
    Scores every column of df with dist_from_1() and returns the scores sorted ascending
    (i.e. features ranked by increasing absolute distance from 1 of the SCR).
    """
    scores = df.apply(dist_from_1)
    ranked = scores.sort_values()
    return ranked


def get_distinguishing_feats(X):
    """
    Returns the features of X that can still tell objects apart, best splitters first.

    Features are ranked by abs(1-SCR) ascending via rank_features(); features that rank_features() scored
    as NaN (e.g. columns that are all 0s or all 1s) are removed.

    Arg:
        X: pandas dataframe with features as columns, populated by 0s and 1s, one row per instance
    Returns:
        A pandas series of features ranked by abs(1-SCR) ascending, with non-distinguishing features removed.
    """
    # dist_from_1() marks unusable features with NaN, so dropping NaNs is the filter.
    return rank_features(X).dropna()


def sample_feature(X, distinguishing_feats):
    """
    Samples one feature to ask about, favouring features that split the data most evenly.

    The ranking in distinguishing_feats is turned into a probability distribution (lower abs(1-SCR) means
    higher probability) and a single feature name is drawn from it with np.random.choice.

    Args:
        X: pandas dataframe with features as columns. Not used by this function; kept so existing callers
           keep working.
        distinguishing_feats: pandas series of features ranked by abs(1-SCR) ascending, with non-distinguishing
                              features removed (index = feature names, values = scores).
    Returns:
        sampled_feat: A string, the sampled feature to ask about.
    Raises:
        ValueError: if distinguishing_feats is empty (there is nothing to sample).
    """
    if len(distinguishing_feats) == 0:
        raise ValueError('distinguishing_feats is empty; there is no feature to sample.')

    # The series is sorted ascending, so the worst (largest) score is the last element. Use .iloc for the
    # positional lookup: the index holds feature names, and [-1] on a non-integer index relies on a
    # positional fallback that pandas has deprecated.
    max_val = distinguishing_feats.iloc[-1]

    # Flip the scores so the best features get the largest weights. The +1 keeps the worst feature eligible
    # (it would otherwise get weight 0).
    weights = max_val - distinguishing_feats + 1

    # Normalise the weights into a probability distribution.
    feat_prob_dist = weights / weights.sum()

    # Draw one feature name according to that distribution.
    sampled = np.random.choice( feat_prob_dist.index, 1, p = feat_prob_dist.values )
    return str(sampled[0])


def split_df_on_feature(df, feature, answer):
    """
    Keeps only the rows of df whose value in `feature` equals `answer`, then removes the `feature` column.
    The input dataframe is not modified.

    Args:
        df: pandas dataframe with features as columns, populated by 0s and 1s, one row per instance
        feature: string, the column name to split on
        answer: int, 0 or 1, reflecting which subset of the dataframe to keep
    Returns:
        pandas dataframe with features as columns (subset of df).
    """
    matches_answer = df[feature] == answer
    kept_rows = df.loc[matches_answer]
    return kept_rows.drop(columns=[feature])

    
# def ask_and_process_answer1(feature, counter, df, answers):
#     """
#     Prints question about the supplied feature and gets the answer.
#     If the answer is 1 or 0 (yes or no), splits the dataset, returning
#     only those instances where the answer holds.
#     If the answer is 2 (unknown), does not split the dataset, but removes
#     that feature from it.
#     Also adds the feature and answer to the answers dictionary.
    
#     Args:
#         feature: a string, a column in df
#         counter: int, the count of the current question.
#         df: pandas dataframe with features as columns, populated by 0s and 1s, one row per instance
#         answers: a dictionary where the keys are features and the values are the user's answers to those features
#     Returns:
#         pandas dataframe with features as columns.
#         answers dict with feature added.
#     """
#     ask_about_feature(feature, counter)
#     answ = int( input() )
    
#     # Check for bad input.
#     while answ not in set([0, 1, 2]):
#         print('Please give valid input (0=no, 1=yes, 2=unknown).')
#         answ = int( input() )
    
#     # Add answer to the answers database (even if it's unknown... Rodrigo can choose how to deal with those)
#     answers[feature] = answ
    
#     # Drop the feature from the dataset without splitting if the answer is 2; else, split on feature.
#     if answ == 2:
#         return df.drop(columns=[feature]), answers
#     else:
#         return split_df_on_feature(df, feature, answ), answers
    
    
def guess_rem_objs(y, X_idcs, counter, answers):
    """
    If the dataset cannot be split by features anymore, but multiple objects still remain, this function
    guesses the remaining objects one by one, in a shuffled order, until endgame.

    The shuffle uses a fixed seed (random_state=3), so the order is the same on every run for the same
    candidates. Guessing stops as soon as the user confirms a guess (win), or when the candidates or the
    20-question budget run out (lose). Exactly one endgame function is always called.

    Args:
        y: pandas series, all objects in dataset
        X_idcs: pandas index, the remaining row labels to choose from (all non-candidates having been pruned)
        counter: int, the count of the current question.
        answers: a dictionary where the keys are features and the values are the user's answers to those
                 features. Not used here; kept for interface compatibility.
    Returns:
        Nothing.
    """
    # Subset the ys based on X_idcs and shuffle them, so the guessing does not follow dataset order.
    candidates = y[y.index.isin(X_idcs)].sample(frac=1, random_state=3)

    for guess in candidates:
        # Out of questions: no point looking at the remaining candidates.
        if counter > 20:
            break

        ask_about_object(guess, counter)
        counter += 1

        # Only an explicit 1 counts as a correct guess; anything else moves on to the next candidate.
        if int( input() ) == 1:
            endgame_win()
            return

    # Reaching this point means no guess was confirmed. That includes the cases where no guess could be
    # made at all (no candidates, or the question budget was already spent).
    quick_endgame_lose()

    
def quick_endgame_lose():
    """Prints the message shown when the game is lost."""
    losing_message = 'dangit'
    print(losing_message)
    
    
def endgame_win():
    """Prints the message shown when the game is won."""
    winning_message = 'oh yeah! I rock'
    print(winning_message)
    

In [43]:
def init_animal_probdist(y):
    """
    Builds the starting score for every animal: the same value, 20, for each one.
    (The values are unnormalised scores, not probabilities that sum to 1.)

    Arg:
        y: pandas series named 'Animal', all objects in dataset
    Returns:
        A series named 'prob' with the animals as index and their uniform prior as value.
        Get the list of animals with .index, and one animal's score with e.g. .loc['antelope'].
    """
    # Put the animals in a one-column frame, attach the constant prior next to them, then promote the
    # Animal column to the index and keep only the prior as a series.
    with_prior = pd.DataFrame(y).assign(prob=np.repeat(20, len(y)))
    return with_prior.set_index('Animal')['prob']
    

def update_animal_probdist(y_probdist, kn, feature_asked, answ):
    """
    Given a user's answer to a question about a particular feature, update the scores over animals by
    halving the score of every animal whose knowledge-base value for that feature contradicts the answer.
    Animals that agree with the answer keep their score.

    The division is positional: the i-th entry of y_probdist is matched with the i-th row of kn. This holds
    when y_probdist was built from kn's 'Animal' column and has not been reordered since.

    Args:
        y_probdist: pandas series with the animals as index and their score as value
        kn: the complete knowledge base (one row per animal, in the same order as y_probdist)
        feature_asked: a string, the feature just asked about (a column of kn)
        answ: an integer, the user's response (0 = no, 1 = yes; anything else is treated as "unknown")
    Returns:
        Pandas series: y_probdist updated to reflect the answer. For answers other than 0 or 1 the input
        series is returned unchanged (nothing was learned), rather than falling through and returning None.
    """
    # An "unknown" answer carries no information, so leave the scores as they are.
    if answ not in (0, 1):
        return y_probdist

    # Divide matches by 1 (unchanged) and mismatches by 2 (halved). np.where builds a fresh array, so kn
    # itself is never modified.
    feature_values = kn[feature_asked].values
    divisor = np.where(feature_values == answ, 1, 2)
    return y_probdist / divisor
    
# Quick manual check: start from the uniform prior and apply a "no" answer for the 'Hair' feature.
# The updated series is the cell's displayed value.
test_probdist = init_animal_probdist(y)
update_animal_probdist(y_probdist=test_probdist, kn=kn, feature_asked='Hair', answ=0)

AttributeError: 'Series' object has no attribute 'bar'

In [50]:
def ask_and_get_answer(feature, counter):
    """
    Prints the question about the supplied feature and reads the user's answer, re-prompting until the
    input is valid. Input that is not an integer at all (e.g. "y" or an empty line) is treated like any
    other invalid answer instead of crashing the game.

    Args:
        feature: a string, the feature to ask about
        counter: int, the count of the current question.
    Returns:
        integer in 0, 1, 2 representing the user's answer (0=no, 1=yes, 2=unknown)
    """
    valid_answers = {0, 1, 2}

    ask_about_feature(feature, counter)
    while True:
        try:
            answ = int( input() )
        except ValueError:
            # Not a number; fall through to the re-prompt below.
            answ = None

        if answ in valid_answers:
            return answ

        print('Please give valid input (0=no, 1=yes, 2=unknown).')


def process_answer(feature, answ, X, kn, answers, y_probdist):
    """
    Records the user's answer and applies it to the game state.

    For a yes/no answer (1/0) the animal scores are updated, the updated scores are printed in descending
    order, and X is reduced to the rows that agree with the answer. For any other answer (e.g. 2 = unknown)
    only the asked feature is removed from X; rows and scores are left alone.

    Args:
        feature: a string, a column in X
        answ: integer in 0, 1, 2 representing the user's answer
        X: pandas dataframe with features as columns, populated by 0s and 1s, one row per instance
        kn: pandas dataframe, the full knowledge base
        answers: a dictionary where the keys are features and the values are the user's answers to those
                 features (updated in place and also returned)
        y_probdist: pandas series with the animals as index and their probability as value
    Returns:
        pandas dataframe with features as columns.
        answers dict with feature added.
        y_probdist with probabilities adjusted.
    """
    # Every answer is recorded, including "unknown".
    answers[feature] = answ

    # Anything that is not a clear yes/no: just retire the feature without splitting or re-scoring.
    if answ != 0 and answ != 1:
        return X.drop(columns=[feature]), answers, y_probdist

    # Clear yes/no: re-score the animals, show the new ranking, and keep only the matching rows.
    y_probdist = update_animal_probdist(y_probdist, kn, feature, 1 if answ == 1 else 0)
    print( y_probdist.sort_values(ascending=False) )
    remaining = split_df_on_feature(X, feature, answ)
    return remaining, answers, y_probdist

    
def ask_and_process_answer(feature, counter, df, answers, y_probdist):
    """
    Prints the question about the supplied feature, gets a valid answer, and applies it to df.
    If the answer is 1 or 0 (yes or no), splits the dataset, returning only those instances where the
    answer holds. If the answer is 2 (unknown), does not split the dataset, but removes that feature from it.
    Also adds the feature and answer to the answers dictionary.

    Unlike process_answer(), this function does not update the animal scores and returns only two values.

    Args:
        feature: a string, a column in df
        counter: int, the count of the current question.
        df: pandas dataframe with features as columns, populated by 0s and 1s, one row per instance
        answers: a dictionary where the keys are features and the values are the user's answers to those
                 features (updated in place and also returned)
        y_probdist: pandas series with the animals as index and their probability as value. Not used here;
                    kept so the signature stays compatible with existing callers.
    Returns:
        pandas dataframe with features as columns.
        answers dict with feature added.
    """
    # Asking and validating is shared with the rest of the game rather than duplicated here.
    answ = ask_and_get_answer(feature, counter)

    # Every answer is recorded, including "unknown".
    answers[feature] = answ

    # ask_and_get_answer() only returns 0, 1 or 2, so 2 is the sole non-splitting case.
    if answ == 2:
        return df.drop(columns=[feature]), answers
    return split_df_on_feature(df, feature, answ), answers

In [39]:
def play_demo(kn, X, y, y_probdist, counter, answers):
    """
    Plays one game of 20 questions by recursively narrowing the knowledge base based on the user's answers
    about whether the target object matches a feature. Each call asks at most one feature question and then
    either finishes the game (one of the base cases) or recurses on the reduced data.

    Args:
        kn: pandas dataframe, the full knowledge base
        X: pandas dataframe with features as columns, populated by 0s and 1s, one row per instance
        y: pandas series, all objects in dataset
        y_probdist: pandas series with the animals as index and their probability as value
        counter: int, the count of the current question.
        answers: a dictionary where the keys are features and the values are the user's answers to those
                 features (updated in place as questions are answered)
    Returns:
        nothing.
    """

    # =============================
    # BASE CASE 0: counter > 20, i.e. the question budget is used up.
    # =============================
    if counter > 20:
        quick_endgame_lose()
        return

    # =============================
    # BASE CASE 0b: No rows left at all, so there is nothing to ask about and nothing to guess.
    # =============================
    if len(X) == 0:
        quick_endgame_lose()
        return

    # =============================
    # BASE CASE 1: Only one row left in the data, so only one object available to guess.
    # =============================
    if len(X) == 1:
        # Pull the single remaining name out as a scalar (rather than rendering the series to text, which
        # can add padding around the value).
        guess = str(y[X.index].iloc[0])
        ask_about_object(guess, counter)
        answ = int( input() )
        if answ == 1:
            endgame_win()
        else:
            quick_endgame_lose()
        return

    # =============================
    # BASE CASE 2: Only one feature left in the data (have asked about all other ones). Ask about that
    # feature, subset the data correspondingly, and then go through all remaining objects.
    # =============================
    if len(X.columns) == 1:
        feature_to_split_on = X.columns[0]
        answ = ask_and_get_answer(feature_to_split_on, counter)
        X_bc2, answers, y_probdist = process_answer(feature_to_split_on, answ, X, kn, answers, y_probdist)
        counter += 1

        if len(X_bc2.index) == 0:
            # The answer ruled out every remaining object.
            quick_endgame_lose()
        else:
            # Cycle through all remaining objects until endgame.
            guess_rem_objs(y, X_bc2.index, counter, answers)  # includes endgame
        return

    # =============================
    # BASE CASE 3: There are no more distinguishing features (every remaining column is constant), so the
    # dataset can't be divided anymore. Cycle through all remaining objects until endgame.
    # =============================
    disting_feats = get_distinguishing_feats(X)
    if len( disting_feats ) == 0:
        guess_rem_objs(y, X.index, counter, answers)  # includes endgame
        return

    # =============================
    # RECURSIVE CASE: None of the base cases applied, so ask another feature question.
    # =============================

    # Sample a feature from disting_feats proportional to how well it splits the data in X and ask about it.
    feature_to_split_on = sample_feature(X, disting_feats)
    answ = ask_and_get_answer(feature_to_split_on, counter)
    X_new, answers, y_probdist = process_answer(feature_to_split_on, answ, X, kn, answers, y_probdist)
    counter += 1
    play_demo(kn, X_new, y, y_probdist, counter, answers)

In [51]:
# Start a fresh game: uniform prior over all animals, first question is number 1, no answers recorded yet.
y_probdist = init_animal_probdist(y)

play_demo(kn, X, y, y_probdist, counter=1, answers={})

Q1: Nlegs_0?
0
Animal
wren        20.0
gnat        20.0
gorilla     20.0
gull        20.0
hamster     20.0
            ... 
herring     10.0
seawasp     10.0
seasnake    10.0
seahorse    10.0
slowworm    10.0
Name: prob, Length: 100, dtype: float64
Q2: Bird?
0
Animal
mongoose    20.0
goat        20.0
hamster     20.0
hare        20.0
honeybee    20.0
            ... 
haddock     10.0
gull        10.0
slowworm    10.0
piranha     10.0
wren        10.0
Name: prob, Length: 100, dtype: float64
Q3: Predator?
0
Animal
moth        20.0
squirrel    20.0
gorilla     20.0
goat        20.0
gnat        20.0
            ... 
herring      5.0
hawk         5.0
rhea         5.0
pitviper     5.0
seal         5.0
Name: prob, Length: 100, dtype: float64
Q4: Domestic?
0
Animal
flea        20.0
termite     20.0
elephant    20.0
fruitbat    20.0
deer        20.0
            ... 
porpoise     5.0
rhea         5.0
gull         5.0
herring      5.0
seasnake     5.0
Name: prob, Length: 100, dtype: float64
Q5: N