# Find-S Algorithm

The Find-S algorithm finds the most specific hypothesis that is consistent with the positive examples of a given dataset. It builds this hypothesis by considering only the positive examples; negative examples are ignored.

In [1]:
import pandas as pd
import numpy as np
import itertools

In [2]:
# Load the training examples; the path is relative to the notebook's working directory.
data = pd.read_csv('enjoysport.csv')

This dataset consists of seven columns: six attributes and the output (`enjoy sport`).

In [3]:
# Render the whole table; it is small enough to be shown in full.
display(data)

Unnamed: 0,sky,air temp,humidity,wind,water,forecast,enjoy sport
0,sunny,warm,normal,strong,warm,same,yes
1,sunny,warm,high,strong,warm,same,yes
2,rainy,cold,high,strong,warm,change,no
3,sunny,warm,high,strong,cool,change,yes


In [4]:
# Attribute matrix: one row per training example, every column except the last (the output column).
concepts = np.array(data)[:,:-1]

In [5]:
# Inspect the attribute matrix.
print(concepts)

[['sunny' 'warm' 'normal' 'strong' 'warm' 'same']
 ['sunny' 'warm' 'high' 'strong' 'warm' 'same']
 ['rainy' 'cold' 'high' 'strong' 'warm' 'change']
 ['sunny' 'warm' 'high' 'strong' 'cool' 'change']]


In [6]:
# Label vector: the last column of the dataset, one entry per training example.
target=np.array(data)[:,-1]

In [7]:
# Inspect the labels.
print(target)

['yes' 'yes' 'no' 'yes']


Initialize the variable `specific_h` with the first positive example.

Then compare every positive example with `specific_h`, attribute by attribute.

If an attribute does not match, replace it in `specific_h` with ‘?’; otherwise leave it unchanged. Repeat until the last positive example has been processed.

In [8]:
def train(con, tar):
    """
    Find-S: build the most specific hypothesis covering every positive example.

    Parameters
    ----------
    con : sequence of rows
        Attribute values of the training examples, one row per example. Each row
        must support ``.copy()`` and item assignment (e.g. a row of a NumPy array).
    tar : sequence
        Labels, paired positionally with the rows of ``con``. An example is
        positive when its label equals ``'yes'``.

    Returns
    -------
    A copy of the first positive row in which every attribute that differs in
    another positive example has been replaced by ``'?'``.

    Raises
    ------
    ValueError
        If no label equals ``'yes'``; there is then no positive example to
        initialise the hypothesis from.
    """
    specific_h = None

    # Single pass: negative examples are ignored, the first positive example
    # seeds the hypothesis and each later positive example generalises it.
    for row, label in zip(con, tar):
        if not (label == 'yes'):
            continue
        if specific_h is None:
            # Copy so that the caller's data is never modified.
            specific_h = row.copy()
            continue
        for x in range(len(specific_h)):
            if row[x] != specific_h[x]:
                specific_h[x] = '?'

    if specific_h is None:
        raise ValueError("train() needs at least one positive ('yes') example")
    return specific_h

The value returned by `train` (the final `specific_h`) is the most specific hypothesis of the dataset.

In [9]:
# Run Find-S on the attribute matrix and labels and show the resulting hypothesis.
print(train(concepts, target))

['sunny' 'warm' '?' 'strong' '?' '?']


This means that a record is classified as positive (yes) whenever its sky, air temp and wind attributes are sunny, warm and strong respectively, irrespective of its humidity, water and forecast attributes (the positions marked ‘?’).

Applying the List-Then-Eliminate Algorithm to Get the Hypotheses Consistent with All Training Examples

VersionSpace <-- a list containing every hypothesis in H
For each training example, <x, c(x)>
remove from VersionSpace any hypothesis h for which h(x) != c(x)
Output the list of hypotheses in VersionSpace

In [10]:
# Work on a copy so that the original `data` frame is left untouched.
X = data.copy()

In [11]:
# Label column as a pandas Series.
# NOTE(review): this rebinds `target`, which earlier held the NumPy label array passed to
# train(); re-running the Find-S cell after this point would pick up this Series instead.
target = X["enjoy sport"]

In [12]:
# Keep every column except the last one (the target concept). Slicing `data` instead of `X`
# keeps this cell idempotent: re-running it no longer strips one more column each time.
X = data.iloc[:, :-1].copy()

In [13]:
# Converts target to bool: True where the label is "yes", False otherwise.
# Built from `data` rather than from `target` itself so the cell is idempotent
# (comparing an already-boolean Series with "yes" would turn every label into False).

target = data["enjoy sport"] == "yes"

In [14]:
class ListThenEliminate():
    """
    List-Then-Eliminate learner: enumerates the whole hypothesis space and keeps
    the hypotheses that are consistent with all the training examples (the
    Version Space).

    A hypothesis is a tuple with one entry per attribute. Each entry is either a
    value observed in the training data, '?' (any value is acceptable) or
    'Φ' (no value is acceptable).
    """

    def fit(self, X, target):
        """
        Computes the Version Space for the given training examples.

        Parameters
        ----------
        X : DataFrame
            Training examples with all attributes (except target concept)

        target : Series
            Boolean target concept of the training examples in X, indexed like X

        Returns
        --------
        None
            The Version Space is stored in ``self.VersionSpace`` as a list of
            hypotheses (tuples), each consistent with all the training examples in X
        """

        # Creating a space of all possible hypotheses from the unique values of
        # each attribute. The values are collected column by column on purpose:
        # X.apply(pd.Series.unique) returns a DataFrame (not a Series of arrays)
        # when every column has the same number of unique values, and iterating
        # that DataFrame would yield the column names instead of the values.
        self.__unique_attributes = [list(column.unique()) for _, column in X.items()]
        for li in self.__unique_attributes:
            li.append('?')
            li.append('Φ')
        self.__H = list(itertools.product(*self.__unique_attributes))

        # Version Space containing the hypotheses consistent with all the training examples
        self.VersionSpace = [
            h for h in self.__H if self.__is_consistent(h, (X, target))
        ]

    def __is_consistent(self, h, D):
        """
        Checks if the hypothesis h is consistent with all the training examples in D

        Parameters
        -----------
            h: tuple
                Hypothesis to test against D
            D: tuple
                A tuple of all training examples with attributes (X: DataFrame) and
                their respective concepts (c: Series)

        Returns
        --------
            bool
                True if the hypothesis classifies every training example in D the
                same way as its target concept, or False otherwise
        """

        for idx, x in D[0].iterrows():
            # The concept is looked up by the row's index label, so X and c must share an index.
            if self.__predict(h, x) != bool(D[1][idx]):
                return False

        return True

    def __predict(self, h, x):
        """
        Predicts instance x to be positive or negative against hypothesis h

        Parameters
        ----------
            h: tuple
                Hypothesis to predict against
            x: Series or list
                Attribute values of the instance, in the same order as h

        Returns
        --------
            bool
                True if the hypothesis h predicts the instance positive, or False otherwise
        """

        for h_value, x_value in zip(h, x):
            # 'Φ' accepts no value at all, so the instance cannot be positive.
            if h_value == 'Φ':
                return False
            # '?' accepts any value; a concrete constraint must match exactly.
            if h_value != '?' and h_value != x_value:
                return False

        return True

In [15]:
# Creates the learner and computes the version space from the attribute frame X and the target Series.

list_then_eliminate = ListThenEliminate()
list_then_eliminate.fit(X, target)

In [16]:
# Shows the version space stored on the learner by fit()
display(list_then_eliminate.VersionSpace)

[('sunny', 'warm', '?', 'strong', '?', '?'),
 ('sunny', 'warm', '?', '?', '?', '?'),
 ('sunny', '?', '?', 'strong', '?', '?'),
 ('sunny', '?', '?', '?', '?', '?'),
 ('?', 'warm', '?', 'strong', '?', '?'),
 ('?', 'warm', '?', '?', '?', '?')]