Making 20Q into a class.

**HOW TO PLAY: Enter 0 for no, 1 for yes, 2 for unknown.**

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the knowledge base the game is played on.
knowledge_base = pd.read_csv('hopkins-knowledge.csv')
# Alternative data file, kept for reference:
# knowledge_base = pd.read_csv('20q-data.csv')

# Reminder of how to select every feature column except 'Animal':
# knowledge_base.loc[:, knowledge_base.columns != 'Animal']
# Sanity check: list every column with its dtype to make sure all feature columns are integers.
knowledge_base.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 29 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Animal        100 non-null    object
 1   Hair          100 non-null    int64 
 2   Feathers      100 non-null    int64 
 3   Eggs          100 non-null    int64 
 4   Milk          100 non-null    int64 
 5   Airborne      100 non-null    int64 
 6   Aquatic       100 non-null    int64 
 7   Predator      100 non-null    int64 
 8   Toothed       100 non-null    int64 
 9   Backbone      100 non-null    int64 
 10  Breathes      100 non-null    int64 
 11  Venomous      100 non-null    int64 
 12  Fins          100 non-null    int64 
 13  Nlegs_0       100 non-null    int64 
 14  Nlegs_2       100 non-null    int64 
 15  Nlegs_4       100 non-null    int64 
 16  Nlegs_5       100 non-null    int64 
 17  Nlegs_6       100 non-null    int64 
 18  Nlegs_8       100 non-null    int64 
 19  Tail     

In [19]:
class TwentyQuestions():
    """
    Plays twenty questions.
    """

    def __init__(self, kn):
        self.kn = kn
        self.y = kn['Animal']
        self.X = knowledge_base.loc[:, knowledge_base.columns != 'Animal']
        self.counter = 1
        self.answers = dict()

        # Initialise the "probability" distribution over y: a uniform prior of 20 (arbitrary number) per animal.
        self.y_probdist = pd.DataFrame(self.y)
        self.y_probdist['prob'] = np.repeat(20, len(self.y))
        self.y_probdist = self.y_probdist.set_index('Animal')['prob']

    def describe_knowledge_base(self):
        print('There are {0} objects and {1} features for each object.'.format(self.y.shape[0], self.X.shape[1]))


    # ====================================================
    # The following methods are for sampling the feature to ask about in each stage of the game, based on the
    # split cardinality ratio.
    # ====================================================

    def get_distinguishing_feats(self):
        """
        Ranks the features in X in ascending order of abs(1-SCR) and filters out those that contain either all 0s or all 1s
        (i.e. those that cannot be used to distinguish between objects).

        Arg:
        Returns:
            A pandas series of features ranked by abs(1-SCR) ascending, with non-distinguishing features removed.
        """
        # Rank the features, drop the NaNs that were put there by dist_from_1(), and return what remains.
        ranked = self.rank_features()
        distinguishing_feats = ranked.dropna()
        return distinguishing_feats

    def rank_features(self):
        """
        Ranks all features in df by their increasing absolute distance from 1 of the SCR.

        Arg:
        Returns:
            A pandas series of features ranked by abs(1-SCR) ascending
        """
        return self.X.apply(self.dist_from_1).sort_values()

    def dist_from_1(self, feat_col):
        """
        Returns the absolute distance from 1 of the split cardinality ratio for the given column of X.

        Arg:
            feat_col: a pandas series, one column in the data frame.
        Returns:
            A float if there are both 0s and 1s in the column, else np.nan
        """
        counts = feat_col.value_counts()
        if len(counts) == 2:  # i.e. if there are both 1s and 0s in the column
            ratio = counts[0] / counts[1]
            return abs( 1 - ratio )
        return np.nan  # Features that get NaNs are filtered out later :)

    def sample_feature(self, distinguishing_feats):
        """
        Ranks the features in X, creates a probability distribution from the ranking, and samples a feature
        according to this probability distribution, returning this as the feature to ask about. Also returns
        the number of distinguishing features (i.e. those that contain both 0 and 1), as a check for when to
        stop trying to split on features.

        Arg:
            distinguishing_feats: pandas series of features ranked by abs(1-SCR) ascending, with non-distinguishing
                                  features removed.
        Returns:
            sampled_feat: A string, the sampled feature to ask about.
        """

        # Get the max value of the distinguishing features (this is the final element, since they're ranked ascending).
        max_val = distinguishing_feats[-1]

        # Subtract each value in the series from max_val+1; now the features will be sorted descending, and the best features
        # to split on will have the highest values.
        # (the +1 is there because otherwise the final feature will have probability 0, and we still want it to be eligible,
        # if improbable)
        ranked_feats_transf = max_val - distinguishing_feats + 1

        # Convert to a probability distribution by dividing by the sum of all observations.
        feat_prob_dist = ranked_feats_transf / ranked_feats_transf.sum()

        # Sample one feature from this distribution and return that feature.
        sampled_feat = np.random.choice( feat_prob_dist.index, 1, p = feat_prob_dist )
        sampled_feat = str(sampled_feat[0])

        return sampled_feat


    # ====================================================
    # The following methods are for asking the user about the sampled feature, getting their answer, splitting the
    # input space accordingly, and updating the probability distribution over animals.
    # ====================================================

    def get_majority_value_and_extremeness(self, feature):
        """
        Looks at how the values are distributed in the given feature. For use in choosing whether to ask
        an unbiased question or a biased question.

        Args:
            X: pandas dataframe with features as columns, populated by 0s and 1s, one row per instance
            feature: a string, the feature we care about.
        Returns:
            majority: integer, 0 or 1, representing majority value for the given feature
            dist_from_equilibrium: float between 0 and 1, representing how out-of-balanced the values for that feature are
            (a value close to 1 means that one value completely overpowers the other; a value closer to 0 means that they
            are better balanced).
        """
        # Count the number of times 0 and 1 each appear in the column and set the more frequent one as majority.
        counts = self.X[feature].value_counts()
        majority = counts.idxmax()

        # Now compute the percentage of ones and determine how far that percentage is from a 50/50 balance. (Multiplied by
        # 2 so that the output distance is in [0, 1], not [0, 0.5] because I think that's more intuitive).
        # See interpretation in docstring.

        if len(counts) == 2: # i.e. if there are both 1s and 0s in the column
            percent_ones = counts[1] / (counts[0] + counts[1])
            dist_from_equilibrium = 2 * abs( percent_ones - 0.5 )

        else: # if only 0s or only 1s in the column; totally out of balance.
            dist_from_equilibrium = 1

        return majority, dist_from_equilibrium

    def ask_and_get_answer(self, feature, majority_val, extremeness):
        """
        Prints question about the supplied feature and gets the answer (checks validity of input).

        Args:
            feature: a string, a column in df
            majority_val: integer, 0 or 1, representing majority value for the given feature.
            extremeness: float between 0 and 1, representing how out-of-balanced the values for that feature are.
        Returns:
            integer in 0, 1, 2 representing the user's answer
        """
        # Asks and gets answer.
        self.ask_about_feature(feature, majority_val, extremeness)
        answ = int( input() )

        # Checks for bad input.
        while answ not in set([0, 1, 2]):
            print('Please give valid input (0=no, 1=yes, 2=unknown).')
            answ = int( input() )

        return answ

    def ask_about_feature(self, feat_name, majority_val, extremeness):
        """
        ANNA: Modify this function to print out a natural language question based on the feature name,
        e.g. "Does it have wings?"

        Arg:
            feat_name: string, name of feature to split dataset on
            majority_val: integer, 0 or 1, representing majority value for the given feature.
            extremeness: float between 0 and 1, representing how out-of-balanced the values for that feature are.
        Prints:
            A string, the natural language question asking about that feature.
        Returns:
            Nothing.
        """
        question = feat_name+'?'
        print('Q'+str(self.counter)+': '+question)

    def process_answer(self, feature, answ):
        """
        Splits X based on user's answer, adds the answer to the answers dictionary, and modifies the probability
        distribution over animals based on the answer.

        Args:
            feature: a string, a column in df
            answ: integer in 0, 1, 2 representing the user's answer
        Returns:
        """
        # Add answer to the answers database (even if it's unknown... Rodrigo can choose how to deal with those)
        self.answers[feature] = answ

        # If the answer is 0 or 1, split dataset, returning only those instances where the answer holds, and update
        # the probability distribution over animals accordingly.
        if answ == 0:
            self.update_animal_probdist(feature, 0)
            self.split_df_on_feature(feature, answ)
        elif answ == 1:
            self.update_animal_probdist(feature, 1)
            self.split_df_on_feature(feature, answ)

        # If the answer is 2, only remove the feature from the dataset; don't split dataset and don't update probdist.
        else:
            self.X = self.X.drop(columns=[feature])

    def split_df_on_feature(self, feature, answer):
        """
        Returns subset of df where df[feature]==answer and drops feature from columns in df.

        Args:
            feature: string, the column name to split on
            answer: int, 0 or 1, reflecting which subset of the dataframe to keep
        Returns:
            pandas dataframe with features as columns (subset of df).
        """
        self.X = self.X[self.X[feature] == answer].drop(columns=[feature])

    def update_animal_probdist(self, feature_asked, answ):
        """
        Given a user's answers to a question about a particular feature, update the probability distribution over animals.

        Args:
            feature_asked: a string, the feature just asked about
            answ: an integer, the user's response
        Returns:
        """

        # Set the index of kn to the Animal column for easy combination with the probability distribution.
        kn = self.kn.set_index('Animal')

        # Extract the column in kn corresponding to the feature we asked about.
        kn_col = pd.Series(kn[feature_asked])  # Copy this column before modifying it, so that we don't modify kn!

        # Halve current value if incompatible.
        # To do this, replace all wrong answers with 2s and correct answers by 1s,
        # and divide by kn_col (divides mismatches by 2 and matches by 1, i.e. matches stay same)
        if answ == 1:
            kn_col = np.where(kn_col == 0, 2, kn_col)
            self.y_probdist = self.y_probdist / kn_col
        elif answ == 0:
            kn_col = np.where(kn_col == 1, 2, 1)
            self.y_probdist = self.y_probdist / kn_col


    # ====================================================
    # The following methods are for once the features are exhausted. They ask about the animals in order of
    # most likely to least likely.
    # ====================================================

    def guess_objs_from_probdist(self):
        """
        To be used once the dataset cannot be split by features anymore but multiple objects still remain.
        Guesses objects in order of descending probability.

        Args:
        Returns:
            Nothing.
        """
        # Sort values descending, so the highest-probability animals are first.
        self.y_probdist.sort_values(ascending=False, inplace=True)

        # Go through animal in descending order of probability and guess.
        for animal in self.y_probdist.index:
            if self.counter <= 20:
                self.ask_about_object(animal)
                self.counter += 1
                answ = int( input() )
                if answ == 1:
                    self.endgame_win()
                    break
            else:
                #self.quick_endgame_lose()
                self.endgame_lose()
                return

    def ask_about_object(self, obj_name):
        """
        ANNA: Modify this function to print out a natural language question based on the object name,
        e.g. "Are you thinking of an ocelot?"

        Arg:
            obj_name: string, name of object to guess.
        Prints:
            A string, the natural language question guessing that object.
        Returns:
            Nothing.
        """
        question = obj_name+'?'
        print('Q'+str(self.counter)+': '+question)


    # ====================================================
    # The following functions are for the endgame: if the system guesses right, it wins. Otherwise, it loses.
    # ====================================================

    #Auxiliary functions for the endgame_lose phase:
    def binary_check(self, x):
        #checks if the array is made out of only 0s, only 1s, or a combination of both
        content = np.unique(x)
        if (len(content) == 2 and 0 in content and 1 in content) or (len(content) ==1 and 0 in content) or (len(content) ==1 and 1 in content):
            return True
        else:
            return False

    def similarity(self, x, y):
        #this function will count how many values x and y have in common

        #adding a check verifying that our input series consist of only 0s and 1s
        if self.binary_check(x) and self.binary_check(y):
            return np.sum(x==y)
        else:
            return -1    
    # ====================================================
    
    def endgame_lose(self):
        """
        TODO RODRIGO: Update this method so that it's compatible with the class variables self.kn, self.y, self.answers

        RODRIGO: If the game is lost, we will need to figure out why (was the 20Q limit reached? Or was the user's
        object not in the knowledge base?) and take action based on that. The code to add in unknown objects can
        be incorporated here.
        I've set the code up so that the answers dictionary ends up here, hopefully that makes your life easier.

        Arg:
            answers: a dictionary where the keys are features and the values are the user's answers to those features
        """
        #======================================

        #Swallowing pride
        print('dangit, you were too smart for me!')

        #======================================

        #Getting correct answer
        print('Which object were you thinking about? (please_write_it_in_this_format)')
        correct_answer = input().lower()
        #adding the correct answer to the answers dict
        self.answers[self.y.name] = correct_answer
        #print(self.answers)
        print('Smart choice!')

        #=======================================

        #Asking for more info

        #=======================================

        # If the correct answer is already in our dataset
        if correct_answer in self.y.unique():
            # If the user's answers contradict our KB we will add a new row to the KB with the new information

            #temporary array to keep the updated row
            correct_answer_index = np.where(self.y==correct_answer)[0][0]
            new_row = self.kn.iloc[[correct_answer_index]].copy()

            #update process
            for attribute, value in self.answers.items():
                if type(value) != str: #making sure to not compare the animal name
                    if (value == new_row[attribute]).bool() == False: #diff than in our KB
                                new_row[correct_answer_index, attribute] = value
            self.kn = self.kn.append(new_row, ignore_index=True)

        #if correct answer is not yet in our dataset
        else:
            #blank new row
            new_row = []

            #filling in the new row
            for i, attribute in enumerate(self.kn.columns, 0):
                if attribute in self.answers.keys(): #knowledge provided by the user
                    new_row.append(self.answers[attribute])
                else:
                    new_row.append(993993)
            
            
            #retrieving the row that is already in our KB with the highest similarity to the answers provided by the user.
            #if there is a tie, we will simply grab the values from the first row having this similarity maximum value.

            #we convert to np array and delete the first value with the string 'Animal'
            rows = [np.asarray(self.kn.iloc[i].copy())[1:] for i in range(self.kn.shape[0])] 

            #here we store the similarity counts between our new row and every other row in our KB
            sim_counts = [self.similarity(rows[i], new_row) for i in range(len(rows))]

            #retrieving the row index corresponding to the animal with the highest similarity and retrieving that row
            most_similar_index = np.argmax(sim_counts)
            most_similar_row = self.kn.iloc[most_similar_index].copy()
            
            #second round filling in the new row with the missing features coming from the most similar existing row
            new_row = []
            
            for i, attribute in enumerate(self.kn.columns, 0):
                if attribute in self.answers.keys(): #knowledge provided by the user
                    new_row.append(self.answers[attribute])
                else:
                    #for the features that were not provided by the user we will use our similarity measure to interpolate the missing values from the most similar row.
                    new_row.append(most_similar_row[i])
            
            #adding it to the KN
            final = dict()
            for i, at in enumerate(self.kn.columns, 0):
                final[at] = new_row[i]
            self.kn = self.kn.append(final, ignore_index=True)

    
    def endgame_win(self):
        """
        RODRIGO: Does something in the event that the game was won.
        """
        print('oh yeah! I rock')

    def quick_endgame_lose(self):
        print('dangit')


    # ====================================================
    # Finally, the following function is a recursive function that plays the game.
    # ====================================================

    def play(self):
        """
        Recursively bisects knowledge base based on user input about whether target object matches the feature.
        Guesses animals in order of their descending probability, given the user's answers.

        Args:
        Returns:
            nothing.
        """

        # -----------------------------
        # BASE CASE 0: counter > 20
        # -----------------------------
        if self.counter > 20:
            print('TOO MANY QUESTIONS!')
            #self.quick_endgame_lose()
            self.endgame_lose(self)
            return

        # -----------------------------
        # BASE CASE 1: Only one row left in the data, so only one object compatible with all the answers thus far.
        # Guess it (at top of probdist) and further objects in order of decreasing probability.
        # -----------------------------

        if len(self.X) == 1:
            print('ONLY ONE OBJECT LEFT!')
            self.guess_objs_from_probdist()  # includes endgame
            return

        # -----------------------------
        # BASE CASE 2: Only one feature left in the data (have asked about all other ones). Will need to ask about that feature,
        # subset the data correspondingly, and then go through all remaining objects in descending order of probability.
        # -----------------------------

        if len(self.X.columns) == 1:
            print('ONLY ONE FEATURE LEFT!')
            feature_to_split_on = self.X.columns[0]
            majority_val, extremeness = self.get_majority_value_and_extremeness(feature_to_split_on)
            answ = self.ask_and_get_answer(feature_to_split_on, majority_val, extremeness)
            self.process_answer(feature_to_split_on, answ)
            self.counter += 1

            # If there are no remaining objects to guess after splitting the data on this feature, then endgame_lose().
            if len(self.X.index) == 0:
                print('NO OBJECTS LEFT TO GUESS!')
                #self.quick_endgame_lose()
                self.endgame_lose(self)
                return
            # Otherwise, cycle through all remaining objects until endgame.
            else:
                self.guess_objs_from_probdist()  # includes endgame
                return

        # -----------------------------
        # BASE CASE 3: There are no more distinguishing features, so the dataset can't be divided anymore.
        # Will just need to cycle through all remaining objects until endgame.
        # -----------------------------

        disting_feats = self.get_distinguishing_feats()

        # Count the distinguishing features in X (i.e. those that aren't all 0s or all 1s) and cycle through objects
        # if there are none.
        if len( disting_feats ) == 0:
            print('NO MORE DISTINGUISHING FEATURES!')
            self.guess_objs_from_probdist()  # includes endgame
            return

        # -----------------------------
        # RECURSIVE CASE: If we get this far, that means we didn't fall into any of the base cases, so the game can be played!
        # -----------------------------

        # Sample a feature disting_feats proportional to how well it splits the data in X and ask about it.
        feature_to_split_on = self.sample_feature(disting_feats)
        majority_val, extremeness = self.get_majority_value_and_extremeness(feature_to_split_on)
        answ = self.ask_and_get_answer(feature_to_split_on, majority_val, extremeness)
        self.process_answer(feature_to_split_on, answ)
        self.counter += 1

        self.play()

In [20]:
# Start an interactive game on the loaded knowledge base. play() reads the answers from input():
# 0 = no, 1 = yes, 2 = unknown.
twq = TwentyQuestions(knowledge_base)
twq.play()

Q1: Milk?


 0


Q2: Nlegs_6?


 0


Q3: Nlegs_2?


 0


Q4: Fins?


 0


Q5: Aquatic?


 0


Q6: Eggs?


 0


ONLY ONE OBJECT LEFT!
Q7: scorpion?


 0


Q8: mongoose?


 0


Q9: reindeer?


 0


Q10: pussycat?


 0


Q11: puma?


 0


Q12: pony?


 0


Q13: polecat?


 0


Q14: pitviper?


 0


Q15: oryx?


 0


Q16: opossum?


 0


Q17: worm?


 0


Q18: mole?


 0


Q19: lynx?


 0


Q20: lion?


 0


dangit, you were too smart for me!
Which object were you thinking about? (please_write_it_in_this_format)


 rodrigo_lopez


Smart choice!


In [None]:
# replacing all 2s with 0s just to check if that's crashing our code
# Debugging aid: play a game on a copy of the knowledge base in which every 2 has been replaced by 0,
# to check whether the 2s are what is crashing the code.
kbtest = knowledge_base.replace(2, 0)
twq_test = TwentyQuestions(kbtest)
twq_test.play()

In [21]:
# Inspect the knowledge base held by the game object after the game played above.
twq.kn

Unnamed: 0,Animal,Hair,Feathers,Eggs,Milk,Airborne,Aquatic,Predator,Toothed,Backbone,...,Tail,Domestic,Catsize,Mammal,Bird,Reptile,Fish,Amphibian,Insect,Invertebrate
0,aardvark,1,0,0,1,0,0,1,1,1,...,0,0,1,1,0,0,0,0,0,0
1,antelope,1,0,0,1,0,0,0,1,1,...,1,0,1,1,0,0,0,0,0,0
2,bass,0,0,1,0,0,1,1,1,1,...,1,0,0,0,0,0,1,0,0,0
3,bear,1,0,0,1,0,0,1,1,1,...,0,0,1,1,0,0,0,0,0,0
4,boar,1,0,0,1,0,0,1,1,1,...,1,0,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,wasp,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
97,wolf,1,0,0,1,0,0,1,1,1,...,1,0,1,1,0,0,0,0,0,0
98,worm,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
99,wren,0,1,1,0,1,0,0,0,1,...,1,0,0,0,1,0,0,0,0,0
