In [2]:
import pandas as pd
import numpy as np

# Load the training examples; the 'buy' column holds the class label.
X = pd.read_csv('dataforCE.csv')
# DataFrame.pop removes the label column from X and hands it back as a Series,
# leaving X with the attribute columns only.
y = X.pop('buy')
# Show the attributes and the labels separated by one blank line.
print(X, y, sep='\n\n')

  citations    size inLibrary       price editions
0      some   small        no  affordable     many
1      many     big        no   expensive      one
2      some     big    always   expensive      few
3      many  medium        no   expensive     many
4      many   small        no  affordable     many

0     no
1    yes
2     no
3    yes
4    yes
Name: buy, dtype: object


In [7]:
def consistent(h, d):
    """Report whether hypothesis ``h`` matches ``d`` position by position.

    ``d`` may be a training example or another hypothesis. A ``'?'`` entry in
    ``h`` accepts any value; every other entry of ``h`` must equal the entry
    of ``d`` at the same index.

    Returns True when no position of ``h`` conflicts with ``d``.
    """
    conflicts = (
        True
        for pos, constraint in enumerate(h)
        # Wildcards never conflict; d is only consulted for concrete entries.
        if constraint != '?' and constraint != d[pos]
    )
    return not any(conflicts)
            
def generalizeHypothesis(inconsistentHypothesis, d):
    """Return the minimal generalization of a hypothesis that accepts ``d``.

    An all-``None`` hypothesis (the initial, maximally specific one) is
    replaced by a copy of ``d``. Otherwise every concrete entry that differs
    from ``d`` at the same index becomes ``'?'``; matching entries and
    existing wildcards are kept. The input hypothesis is not modified.
    """
    # Guard clause: nothing has been learned yet, so adopt the example itself.
    if all(entry is None for entry in inconsistentHypothesis):
        return d.copy()

    widened = inconsistentHypothesis.copy()
    for pos, entry in enumerate(inconsistentHypothesis):
        if entry != '?' and entry != d[pos]:
            widened[pos] = '?'
    return widened

def specializeHypothesis(hypothesis, d, attribute_values):
    """List the minimal specializations of ``hypothesis`` that reject ``d``.

    For every ``'?'`` position, one new hypothesis is produced per value in
    ``attribute_values[position]`` that differs from ``d`` at that position.
    Positions that already hold a concrete value are left alone. Results are
    ordered by position, then by the iteration order of the value collection.
    """
    def pinned(pos, value):
        # Copy so each specialization is independent of the input and others.
        refined = hypothesis.copy()
        refined[pos] = value
        return refined

    return [
        pinned(pos, value)
        for pos, constraint in enumerate(hypothesis)
        if constraint == '?'
        for value in attribute_values[pos]
        if value != d[pos]
    ]
    
# Main Function
# Candidate-Elimination: maintain the most general (G) and most specific (S)
# boundaries of the version space while scanning the training examples once.

def _more_general_or_equal(general, specific):
    """Return True when `general` is at least as general as `specific`.

    The all-None hypothesis accepts no example at all, so every hypothesis is
    at least as general as it. Otherwise `general` must hold '?' or the same
    entry as `specific` at every position.
    """
    if all(entry is None for entry in specific):
        return True
    return consistent(general, specific)


General_Boundary = [['?', '?', '?', '?', '?']]
Specific_Boundary = [[None, None, None, None, None]]

# Allowed values of each attribute, in the column order of X.
attribute_values = [
    {"some", "many"},
    {"small", "medium", "big"},
    {"no", "always"},
    {"affordable", "expensive"},
    {"many", "one", "few"}
]

for idx in range(X.shape[0]):
    # d is a training example in X's dataframe.
    d = X.iloc[idx].tolist()
    # Read the label by position as well, so a non-default index on y cannot
    # pair the example with the wrong label.
    label = y.iloc[idx]

    # d is a positive example
    if label == 'yes':
        # Remove from G any hypothesis that does not accept d
        General_Boundary = [g for g in General_Boundary if consistent(g, d)]

        # Replace each hypothesis in S that rejects d by its minimal
        # generalization, provided some member of G is at least as general.
        updated_specific = []
        for s in Specific_Boundary:
            if consistent(s, d):
                candidate = s
            else:
                candidate = generalizeHypothesis(s, d)
                if not any(_more_general_or_equal(g, candidate) for g in General_Boundary):
                    continue
            if candidate not in updated_specific:
                updated_specific.append(candidate)
        Specific_Boundary = updated_specific

    # d is a negative example
    else:
        # Remove from S any hypothesis that (wrongly) accepts d
        Specific_Boundary = [s for s in Specific_Boundary if not consistent(s, d)]

        # Hypotheses in G that accept d are replaced by their minimal
        # specializations; the others are kept unchanged.
        updated_general = []
        new_specializations = []
        for g in General_Boundary:
            if consistent(g, d):
                new_specializations.extend(specializeHypothesis(g, d, attribute_values))
            else:
                updated_general.append(g)

        # Admit a specialization only if it is at least as general as some
        # member of S; anything else would exclude a known positive example.
        for h in new_specializations:
            if any(_more_general_or_equal(h, s) for s in Specific_Boundary):
                updated_general.append(h)

        # Drop exact duplicates first so that identical hypotheses do not
        # eliminate each other in the dominance check below.
        distinct_general = []
        for h in updated_general:
            if h not in distinct_general:
                distinct_general.append(h)

        # Remove from G any hypothesis that is strictly less general than
        # another hypothesis in G
        General_Boundary = [
            h for h in distinct_general
            if not any(
                other != h and _more_general_or_equal(other, h)
                for other in distinct_general
            )
        ]

print("Final Specific Boundary: ", Specific_Boundary)
print("Final General Boundary: ", General_Boundary)

Final Specific Boundary:  [['many', '?', 'no', '?', '?']]
Final General Boundary:  [['many', '?', '?', '?', '?']]


In [12]:
# Generating Version Space
from itertools import product
def generate_version_space(S, G):
    """Enumerate the hypotheses spanned by each (specific, general) pair.

    For every ``s`` in ``S`` and ``g`` in ``G`` each position offers:
      * only ``s[i]`` when ``s[i]`` equals ``g[i]`` or is already ``'?'``;
      * either ``s[i]`` or ``'?'`` otherwise.
    Every combination of those choices is added to the result.

    Returns a set of tuples, one tuple per hypothesis.
    """
    # Materialize both boundaries so G can be traversed once per member of S.
    specific_members = list(S)
    general_members = list(G)

    version_space = set()
    for s in specific_members:
        for g in general_members:
            choices_per_position = []
            for pos, s_value in enumerate(s):
                if s_value == g[pos] or s_value == '?':
                    choices_per_position.append((s_value,))
                else:
                    choices_per_position.append((s_value, '?'))
            # product already yields tuples, one per combination of choices.
            version_space.update(product(*choices_per_position))
    return version_space

In [14]:
# Enumerate every hypothesis lying between the two learned boundaries.
Version_Space = generate_version_space(S=Specific_Boundary, G=General_Boundary)
print("Version Space: ", Version_Space)

Version Space:  {('many', '?', '?', '?', '?'), ('many', '?', 'no', '?', '?')}
