In [41]:
import numpy as np
import pandas as pd

In [42]:
def load_data(filename, target):
    """Read a comma-separated file and split it into features and target.

    The loaded frame and both arrays are also printed to stdout.

    Parameters
    ----------
    filename : path or buffer accepted by ``pandas.read_csv``
    target : str
        Name of the column holding the class label.

    Returns
    -------
    tuple
        ``(dataset, x, y, target)`` where ``dataset`` is the full DataFrame,
        ``x`` is an array of every column except ``target``, ``y`` is an
        array of the ``target`` column, and ``target`` is passed through
        unchanged.

    Raises
    ------
    KeyError
        If ``target`` is not a column of the file.
    """
    dataset = pd.read_csv(filename, sep=",")  # read .csv file into dataset variable

    print(dataset,"\n")

    # Name the axis with the ``columns`` keyword: passing it positionally
    # (``drop([target], 1)``) was deprecated in pandas 1.3 and is a TypeError
    # from pandas 2.0 onwards.
    x = np.array(dataset.drop(columns=[target]))  # all features, target excluded
    y = np.array(dataset[target])  # target class only

    # NOTE(review): cell values are used exactly as read; no whitespace is
    # stripped here, so 'Japan ' and 'Japan' would count as different values.
    print("\nX = \n",x)
    print("\nY = \n",y)

    print("\n\n")

    return (dataset,x,y,target)

# Load the car data with "Likeable" as the target column.
# inp is the (dataset, x, y, target) tuple returned by load_data.
inp = load_data("EconomyCar.csv","Likeable")
print(inp[0])

  Country Manufacturer  Color  Year     Type Likeable
0  Japan         Honda  Blue   1980  Economy      Yes
1  Japan        Toyota  Green  1970   Sports       No
2  Japan        Toyota  Blue   1990  Economy      Yes
3     USA     Chrysler    Red  1980  Economy       No
4  Japan         Honda  White  1980  Economy      Yes 


X = 
 [['Japan ' 'Honda' 'Blue ' 1980 'Economy']
 ['Japan ' 'Toyota' 'Green' 1970 'Sports']
 ['Japan ' 'Toyota' 'Blue ' 1990 'Economy']
 ['USA' 'Chrysler' 'Red' 1980 'Economy']
 ['Japan ' 'Honda' 'White' 1980 'Economy']]

Y = 
 ['Yes' 'No' 'Yes' 'No' 'Yes']



  Country Manufacturer  Color  Year     Type Likeable
0  Japan         Honda  Blue   1980  Economy      Yes
1  Japan        Toyota  Green  1970   Sports       No
2  Japan        Toyota  Blue   1990  Economy      Yes
3     USA     Chrysler    Red  1980  Economy       No
4  Japan         Honda  White  1980  Economy      Yes


In [43]:
def convert_data(dataset,x,y,target):
    """Pair every feature row with its label and collect the feature names.

    Parameters
    ----------
    dataset : DataFrame whose columns include ``target``
    x : indexable of feature rows, one per row of ``dataset``
    y : indexable of labels, one per row of ``dataset``
    target : name of the label column, excluded from the header names

    Returns
    -------
    tuple
        ``(data, header_names)``: ``data`` is a list of
        ``(feature_tuple, label)`` pairs and ``header_names`` is the list of
        column names other than ``target``, in column order.

    Both results are printed before being returned.
    """
    header_names = [column for column in dataset.columns if column != target]
    print("Header Names: ",tuple(header_names))

    # Row count comes from the frame; x and y are indexed in step with it.
    row_count = len(dataset)
    data = [(tuple(x[row]), y[row]) for row in range(row_count)]

    print("\nDataset: ",data)

    return (data, header_names)
    
    
# data_inp is the (data, header_names) tuple returned by convert_data.
data_inp = convert_data(inp[0],inp[1],inp[2],inp[3])

Header Names:  ('Country', 'Manufacturer', 'Color', 'Year', 'Type')

Dataset:  [(('Japan ', 'Honda', 'Blue ', 1980, 'Economy'), 'Yes'), (('Japan ', 'Toyota', 'Green', 1970, 'Sports'), 'No'), (('Japan ', 'Toyota', 'Blue ', 1990, 'Economy'), 'Yes'), (('USA', 'Chrysler', 'Red', 1980, 'Economy'), 'No'), (('Japan ', 'Honda', 'White', 1980, 'Economy'), 'Yes')]


In [44]:
class holder():
    """Table of the distinct values observed for each attribute.

    Attributes
    ----------
    attributes : the attribute names handed to the constructor.
    factors : dict mapping each attribute name to the list of distinct
        values seen for it, in first-seen order.
    """
    attributes = ()

    def __init__(self,attr):
        """Start an empty value list for every attribute name in ``attr``."""
        self.attributes = attr
        # The dict must belong to the instance. A class-level dict is shared
        # by every holder, so building a second holder would wipe the value
        # lists already collected by the first one.
        self.factors = {name: [] for name in attr}

    def add_values(self,data,header_names):
        """Record every not-yet-seen feature value under its attribute name.

        Parameters
        ----------
        data : sequence of ``(features, label)`` pairs; only ``features``
            (index 0) is read.
        header_names : attribute names, positionally aligned with the
            entries of each ``features`` sequence. Every name must already
            be a key of ``self.factors`` (otherwise ``KeyError``).
        """
        for example in data:
            features = example[0]
            for position, name in enumerate(header_names):
                value = features[position]
                seen = self.factors[name]
                if value not in seen:
                    seen.append(value)

# Build the attribute -> observed-values table from the converted data:
# data_inp[0] is the list of examples, data_inp[1] the attribute names.
f = holder(data_inp[1])
f.add_values(data_inp[0],data_inp[1])
print("\nFactors: ",f.factors)
print("\nAttributes: ",f.attributes)


Factors:  {'Country': ['Japan ', 'USA'], 'Manufacturer': ['Honda', 'Toyota', 'Chrysler'], 'Color': ['Blue ', 'Green', 'Red', 'White'], 'Year': [1980, 1970, 1990], 'Type': ['Economy', 'Sports']}

Attributes:  ['Country', 'Manufacturer', 'Color', 'Year', 'Type']


In [45]:
class CandidateElimination():
    """Candidate-Elimination learner over attribute-value hypotheses.

    A hypothesis is a tuple with one entry per attribute. An entry is either
    a concrete value, ``'?'`` (matches any value) or ``'-'`` (the entry used
    by the initial, most specific hypothesis). ``run_algorithm`` maintains a
    specific boundary ``S`` and a general boundary ``G`` (both lists of
    hypotheses) and updates them for every training example.

    Attributes
    ----------
    num_factors : number of attributes per example.
    factors : dict mapping attribute name to the list of its known values.
    attr : attribute names, positionally aligned with example features.
    dataset : the ``(features, label)`` training examples.
    S, G : final boundaries of the last ``run_algorithm`` call
        (``None`` until it has been run).
    """
    # Not read by any method of this class; kept so existing references to
    # these names keep working.
    Positive = {}
    Negative = {}

    def __init__(self,data,fact):
        """Store the training data and the attribute value table.

        Parameters
        ----------
        data : sequence of ``(features, label)`` pairs.
        fact : object exposing ``factors`` (dict: attribute name -> list of
            values) and ``attributes`` (attribute names in feature order).
        """
        self.factors = fact.factors
        self.attr = fact.attributes
        self.dataset = data
        # With no examples there is no feature tuple to measure, so fall
        # back to the number of declared attributes instead of failing.
        if len(data) > 0:
            self.num_factors = len(data[0][0])
        else:
            self.num_factors = len(self.attr)
        self.S = None
        self.G = None


    def run_algorithm(self):
        """Process every example in order, printing S and G after each one.

        The final boundaries are printed and also stored on ``self.S`` and
        ``self.G``. Returns ``None``.

        Raises
        ------
        TypeError
            If an example's label is not one of 'Yes', 'No', 1 or 0.
        """
        G = self.initializeG()
        S = self.initializeS()

        for count, trial_set in enumerate(self.dataset, start=1):  # check for every input
            print("\nInput %d: "%(count),trial_set)
            instance = trial_set[0]
            if self.is_positive(trial_set):
                print("Positive")
                G = self.remove_inconsistent_G(G,instance)
                S_new = S[:]
                for s in S:
                    if not self.consistent(s,instance):
                        # Replace s by its minimal generalization, provided
                        # some member of G still covers that generalization.
                        S_new.remove(s)
                        generalization = self.generalize_inconsistent_S(s, instance)
                        generalization = self.get_general(generalization,G)
                        if generalization is not None:
                            S_new.append(generalization)
                # Prune once, after every member of S has been handled.
                S = self.remove_more_general(S_new)
            else:   # if negative input
                print("Negative")
                S = self.remove_inconsistent_S(S, instance)
                G_new = G[:]
                for g in G:
                    if self.consistent(g,instance):
                        # Replace g by those of its minimal specializations
                        # that still cover some member of S.
                        G_new.remove(g)
                        specialization = self.specialize_inconsistent_G(g, instance)
                        G_new.extend(self.get_specific(specialization, S))
                G = self.remove_more_specific(G_new)
            print("S%d: "%(count),S)
            print("G%d: "%(count),G)

        print("\nFinal S: ", S)
        print("Final G: ",G)
        # Keep the result available to callers without parsing stdout.
        self.S = S
        self.G = G


    def initializeG(self):
        """Return the initial general boundary: one all-'?' hypothesis."""
        G = tuple(['?' for i in range(self.num_factors)])
        return [G]

    def initializeS(self):
        """Return the initial specific boundary: one all-'-' hypothesis."""
        S = tuple(['-' for i in range(self.num_factors)])
        return [S]

    def is_positive(self,trial_set):
        """Return True for a 'Yes'/1 label and False for a 'No'/0 label.

        ``trial_set`` is a whole ``(features, label)`` example; only the
        label (index 1) is inspected. Any other label raises ``TypeError``.
        """
        label = trial_set[1]
        if label == 'Yes' or label == 1:
            return True
        elif label == 'No' or label == 0:
            return False
        else:
            raise TypeError("Invalid Target Value")

    def remove_inconsistent_G(self,hypothesis,instance):
        """Return the hypotheses that match ``instance`` (new list)."""
        return [g for g in hypothesis if self.consistent(g,instance)]

    def remove_inconsistent_S(self,hypothesis,instance):
        """Return the hypotheses that do NOT match ``instance`` (new list)."""
        return [s for s in hypothesis if not self.consistent(s,instance)]


    def consistent(self, hypothesis, instance):
        """Return True when every entry of ``hypothesis`` matches the
        value at the same position of ``instance``."""
        for i,factor in enumerate(hypothesis):
            if not self.match_factor(factor, instance[i]):
                return False
        return True

    def match_factor(self, val1, val2):
        """Return True when either value is '?' or both values are equal."""
        if val1 == '?' or val2 == '?':
            return True
        elif val1 == val2:
            return True
        return False

    def generalize_inconsistent_S(self, hypothesis,instance):
        """Return the smallest generalization of ``hypothesis`` that matches
        ``instance``: '-' entries take the instance value, entries that
        disagree with the instance become '?'."""
        hypo = list(hypothesis)
        for i,factor in enumerate(hypo):
            if factor == '-':
                hypo[i] = instance[i]
            elif not self.match_factor(factor,instance[i]):
                hypo[i] = '?'
        return tuple(hypo)

    def specialize_inconsistent_G(self, hypothesis,instance):
        """Return every hypothesis obtained by replacing a single '?' of
        ``hypothesis`` with a known value of that attribute other than the
        one ``instance`` has there."""
        specializations = []
        hypo = list(hypothesis)
        for i,factor in enumerate(hypo):
            if factor == '?':
                values = self.factors[self.attr[i]]
                for j in values:
                    if instance[i] != j:
                        hyp = hypo[:]
                        hyp[i] = j
                        specializations.append(tuple(hyp))
        return specializations

    def get_general(self, generalization, G):
        """Return ``generalization`` if some member of ``G`` is at least as
        general as it, otherwise ``None``."""
        for g in G:
            if self.more_general(g,generalization):
                return generalization
        return None

    def get_specific(self, specializations, S):
        """Return the specializations that are at least as general as some
        member of ``S``.

        The all-'-' initial hypothesis is accepted as being below every
        specialization. Each specialization is returned at most once, even
        when several members of ``S`` qualify it.
        """
        null_hypothesis = self.initializeS()[0]
        valid_specialization = []
        for hypo in specializations:
            if any(self.more_specific(s,hypo) or s == null_hypothesis for s in S):
                valid_specialization.append(hypo)
        return valid_specialization

    def more_general(self, hypo1, hypo2):
        """Return True when ``hypo1`` is at least as general as ``hypo2``:
        at every position ``hypo1`` holds '?' or the same entry as
        ``hypo2``."""
        for i,j in zip(hypo1,hypo2):
            if i == '?':
                continue
            # A concrete entry only covers the identical entry; in
            # particular it does not cover a '?' in hypo2.
            if i != j:
                return False
        return True

    def more_specific(self,hypo1,hypo2):
        """Return True when ``hypo1`` is at least as specific as ``hypo2``."""
        return self.more_general(hypo2, hypo1)

    def remove_more_general(self, hypotheses):
        """Return ``hypotheses`` without duplicates and without any member
        that is more general than a different member (order preserved)."""
        S_new = []
        for candidate in hypotheses:
            if candidate in S_new:
                continue
            # Build a fresh list rather than removing while iterating,
            # which would skip the element after each removal.
            dominated = any(
                other != candidate and self.more_general(candidate, other)
                for other in hypotheses
            )
            if not dominated:
                S_new.append(candidate)
        return S_new

    def remove_more_specific(self, hypotheses):
        """Return ``hypotheses`` without duplicates and without any member
        that is more specific than a different member (order preserved)."""
        G_new = []
        for candidate in hypotheses:
            if candidate in G_new:
                continue
            dominated = any(
                other != candidate and self.more_specific(candidate, other)
                for other in hypotheses
            )
            if not dominated:
                G_new.append(candidate)
        return G_new
            
# Run Candidate-Elimination on the converted examples (data_inp[0]) using the attribute value table held by f.
output = CandidateElimination(data_inp[0],f)
output.run_algorithm()


Input 1:  (('Japan ', 'Honda', 'Blue ', 1980, 'Economy'), 'Yes')
Positive
S1:  [('Japan ', 'Honda', 'Blue ', 1980, 'Economy')]
G1:  [('?', '?', '?', '?', '?')]

Input 2:  (('Japan ', 'Toyota', 'Green', 1970, 'Sports'), 'No')
Negative
S2:  [('Japan ', 'Honda', 'Blue ', 1980, 'Economy')]
G2:  [('?', 'Honda', '?', '?', '?'), ('?', '?', 'Blue ', '?', '?'), ('?', '?', '?', 1980, '?'), ('?', '?', '?', '?', 'Economy')]

Input 3:  (('Japan ', 'Toyota', 'Blue ', 1990, 'Economy'), 'Yes')
Positive
S3:  [('Japan ', '?', 'Blue ', '?', 'Economy')]
G3:  [('?', '?', 'Blue ', '?', '?'), ('?', '?', '?', '?', 'Economy')]

Input 4:  (('USA', 'Chrysler', 'Red', 1980, 'Economy'), 'No')
Negative
S4:  [('Japan ', '?', 'Blue ', '?', 'Economy')]
G4:  [('?', '?', 'Blue ', '?', '?'), ('Japan ', '?', '?', '?', 'Economy')]

Input 5:  (('Japan ', 'Honda', 'White', 1980, 'Economy'), 'Yes')
Positive
S5:  [('Japan ', '?', '?', '?', 'Economy')]
G5:  [('Japan ', '?', '?', '?', 'Economy')]

Final S:  [('Japan ', '?', '?'