In [None]:
import numpy as np
import pandas as pd

def importdata(path='ENJOYSPORT.csv'):
    """Read a CSV file and hand it back as a pandas DataFrame.

    Parameters
    ----------
    path : str or file-like, optional
        Passed unchanged to ``pd.read_csv``. Defaults to ``'ENJOYSPORT.csv'``,
        which is resolved relative to the current working directory.

    Returns
    -------
    pandas.DataFrame
        The parsed table, using ``pd.read_csv``'s default settings.
    """
    return pd.read_csv(path)

# Sanity check: read the default CSV ('ENJOYSPORT.csv') and print the whole
# frame so the column names and rows are visible in the cell output.
print(importdata())




     Sky AirTemp Humidity    Wind Water Forecast  EnjoySport
0  Sunny    Warm   Normal  Strong  Warm     Same           1
1  Sunny    Warm     High  Strong  Warm     Same           1
2  Rainy    Cold     High  Strong  Warm   Change           0
3  Sunny    Warm     High  Strong  Cool   Change           1


In [None]:
# Function to calculate entropy
def calculate_entropy(column):
    """Return the Shannon entropy, in bits, of the values in ``column``.

    Parameters
    ----------
    column : pandas.Series
        The observations whose class distribution is measured.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the distinct non-missing values, where
        ``p`` is each value's share of the non-missing observations.
        A column with a single distinct value, an empty column, or a column
        holding only missing values yields ``0.0``.
    """
    # normalize=True divides by the number of non-missing observations.
    # value_counts() drops NaN, so dividing by len(column) instead would give
    # probabilities that do not sum to 1 whenever values are missing.
    probabilities = column.value_counts(normalize=True)

    # Categorical columns report unobserved categories with a count of 0;
    # drop them so log2 is never evaluated at 0.
    probabilities = probabilities[probabilities > 0]

    # Calculate entropy
    entropy = -np.sum(probabilities * np.log2(probabilities))

    # Negating a zero sum produces -0.0; adding 0.0 normalises it to 0.0.
    return entropy + 0.0
# Load the dataset into the module-level name `data`; the information-gain
# examples further down read this same variable.
data = importdata()

# Assuming 'data' is your DataFrame and 'EnjoySport' is the target column:
# measure how mixed the target labels are before any split.
entropy_enjoysport = calculate_entropy(data['EnjoySport'])
print(f"Entropy of EnjoySport: {entropy_enjoysport}")


Entropy of EnjoySport: 0.8112781244591328


In [None]:
# Information Gain calculation function
def calculate_information_gain(data, feature, target):
    """Information gain of ``target`` when ``data`` is partitioned by ``feature``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding both the ``feature`` and ``target`` columns.
    feature : str
        Column whose distinct values define the partitions.
    target : str
        Column whose entropy is being reduced.

    Returns
    -------
    float
        Entropy of ``data[target]`` minus the size-weighted average entropy
        of ``target`` inside each partition.
    """
    entropy_before = calculate_entropy(data[target])

    # Distinct feature values (sorted by np.unique) and how often each occurs.
    levels, level_counts = np.unique(data[feature], return_counts=True)
    n_rows = len(data[feature])

    # Each partition contributes its entropy scaled by its share of the rows.
    entropy_after = sum(
        (level_count / n_rows) * calculate_entropy(data[data[feature] == level][target])
        for level, level_count in zip(levels, level_counts)
    )

    return entropy_before - entropy_after

# Example: Calculate Information Gain for the 'Sky' feature, using the
# module-level `data` frame and 'EnjoySport' as the target column.
info_gain_sky = calculate_information_gain(data, 'Sky', 'EnjoySport')
print(f"Information Gain for 'Sky': {info_gain_sky}")

# Similarly, you can calculate information gain for other features;
# the same call is repeated with a different feature column each time.
info_gain_airtemp = calculate_information_gain(data, 'AirTemp', 'EnjoySport')
info_gain_humidity = calculate_information_gain(data, 'Humidity', 'EnjoySport')

print(f"Information Gain for 'AirTemp': {info_gain_airtemp}")
print(f"Information Gain for 'Humidity': {info_gain_humidity}")


Information Gain for 'Sky': 0.8112781244591328
Information Gain for 'AirTemp': 0.8112781244591328
Information Gain for 'Humidity': 0.12255624891826566


In [None]:
class DecisionTreeID3:
    """Decision tree over categorical features, stored as nested dictionaries.

    The tree is grown by splitting on features in a caller-supplied order
    (see ``fit_predefined``); features are not ranked by information gain.

    An internal node is ``{feature_name: {feature_value: subtree, ...}}`` and
    a leaf is a bare target value.
    """

    def __init__(self):
        # Root of the fitted tree; stays None until fit_predefined() is called.
        self.tree = None

    def calculate_entropy(self, column):
        """Return the Shannon entropy, in bits, of the values in ``column``.

        Parameters
        ----------
        column : pandas.Series
            The observations whose class distribution is measured.

        Returns
        -------
        float
            ``-sum(p * log2(p))`` over the distinct non-missing values. A
            column with a single distinct value yields exactly ``0.0``.
        """
        # Share of each distinct value among the non-missing observations.
        probabilities = column.value_counts(normalize=True)

        # Drop zero-probability entries rather than adding an epsilon inside
        # log2: the epsilon biases every result and makes the entropy of a
        # pure column slightly negative.
        probabilities = probabilities[probabilities > 0]

        entropy = -np.sum(probabilities * np.log2(probabilities))

        # Negating a zero sum produces -0.0; adding 0.0 normalises it to 0.0.
        return entropy + 0.0

    def build_tree_predefined(self, data, feature_order, target):
        """Recursively build the tree, splitting on ``feature_order`` in sequence.

        Parameters
        ----------
        data : pandas.DataFrame
            Rows reaching this node; must contain ``target`` and every
            feature named in ``feature_order``.
        feature_order : sequence of str
            Features still available; the first one is used at this node and
            the rest are passed to the children.
        target : str
            Name of the label column.

        Returns
        -------
        dict or scalar
            A nested ``{feature: {value: subtree}}`` dictionary, or a single
            target value when the node is a leaf.
        """
        target_values = np.unique(data[target])

        # Base case 1: every row carries the same label -> leaf with that label.
        if len(target_values) == 1:
            return target_values[0]

        # Base case 2: no features left to split on -> leaf with the most
        # frequent label (first entry of mode() when there is a tie).
        if len(feature_order) == 0:
            return data[target].mode()[0]

        # Split on the next feature in the predefined order; children only
        # see the features that come after it.
        best_feature = feature_order[0]
        remaining_features = feature_order[1:]

        branches = {}
        for value in np.unique(data[best_feature]):
            subset = data[data[best_feature] == value]
            branches[value] = self.build_tree_predefined(subset, remaining_features, target)

        return {best_feature: branches}

    def fit_predefined(self, data, feature_order, target):
        """Build the tree from ``data`` and store it on ``self.tree``.

        Parameters are forwarded unchanged to ``build_tree_predefined``.
        """
        self.tree = self.build_tree_predefined(data, feature_order, target)

    def print_tree(self, tree=None, indent="  "):
        """Print the tree to stdout, one key or leaf per line.

        Parameters
        ----------
        tree : dict or scalar, optional
            Subtree to print; ``None`` means the fitted ``self.tree``.
        indent : str, optional
            Prefix for the current depth; two spaces are appended per level.
        """
        if tree is None:
            tree = self.tree
        if isinstance(tree, dict):
            # Feature names and feature values are both dictionary keys, so
            # each gets its own line and its own indentation level.
            for key, value in tree.items():
                print(f"{indent}{key}")
                self.print_tree(value, indent + "  ")
        else:
            print(f"{indent}-> {tree}")

# Example usage:

# Create the dataset
# NOTE(review): this rebinds the module-level name `data`, replacing the frame
# loaded from the CSV in the earlier cell. Re-running the information-gain
# cell after this one would operate on this 4-row toy frame (which has no
# 'Humidity' column). Consider a distinct name once all users of `data`
# have been checked.
data = pd.DataFrame({
    'Sky': ['Sunny', 'Sunny', 'Rainy', 'Rainy'],
    'AirTemp': ['Warm', 'Warm', 'Warm', 'Cold'],
    'EnjoySport': [1, 1, 1, 0]
})

# Initialize the Decision Tree
tree_model = DecisionTreeID3()

# Define the feature order (first split by 'Sky', then by 'AirTemp')
feature_order = ['Sky', 'AirTemp']

# Fit the tree using the predefined feature order
tree_model.fit_predefined(data, feature_order, 'EnjoySport')

# Print the decision tree (feature names and values on their own lines,
# leaves prefixed with "->")
tree_model.print_tree()


  Sky
    Rainy
      AirTemp
        Cold
          -> 0
        Warm
          -> 1
    Sunny
      -> 1
