In [3]:
import pandas as pd
import numpy as np
from collections import Counter

# Calculate entropy
def entropy(target_column):
    """Return the Shannon entropy (base 2) of the labels in ``target_column``.

    Parameters
    ----------
    target_column : array-like
        Class labels; anything ``np.unique`` accepts (list, ndarray, Series).

    Returns
    -------
    numpy.float64
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct label. A single-class or empty input yields zero.
    """
    _, counts = np.unique(target_column, return_counts=True)
    # Relative frequency of each distinct label, computed once for all classes.
    probabilities = counts / counts.sum()
    # np.unique only reports labels that occur, so every probability is > 0
    # and log2 never sees a zero.
    return -np.sum(probabilities * np.log2(probabilities))

# Calculate Information Gain
def info_gain(data, split_attribute_name, target_name="PlayTennis"):
    """Return the information gain of splitting ``data`` on one attribute.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to evaluate; must contain both named columns.
    split_attribute_name : str
        Column whose distinct values define the partitions.
    target_name : str, default "PlayTennis"
        Column holding the class labels.

    Returns
    -------
    numpy.float64
        Entropy of the target column minus the size-weighted entropy of the
        target column within each partition.
    """
    # Entropy of the target before splitting.
    total_entropy = entropy(data[target_name])

    # Distinct attribute values and how many rows carry each of them.
    values, counts = np.unique(data[split_attribute_name], return_counts=True)
    weights = counts / counts.sum()

    # Select each partition with a boolean mask. Unlike `where(...).dropna()`,
    # this keeps rows that have missing values in unrelated columns and does
    # not upcast numeric columns to float.
    weighted_entropy = np.sum([
        weight * entropy(data.loc[data[split_attribute_name] == value, target_name])
        for value, weight in zip(values, weights)
    ])

    return total_entropy - weighted_entropy

# Build ID3 tree
def id3(data, original_data, features, target_name="PlayTennis", parent_node_class=None):
    """Recursively build an ID3 decision tree.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows belonging to the current node.
    original_data : pandas.DataFrame
        The full training set; its majority class labels an empty node.
    features : list
        Column names still available for splitting (indexed by position).
    target_name : str, default "PlayTennis"
        Column holding the class labels.
    parent_node_class : optional
        Majority class of the parent node, returned when no features remain.

    Returns
    -------
    dict or label
        A leaf is a single class label. An internal node is
        ``{feature: {value: subtree, ...}}`` with one entry per distinct value
        of the chosen feature in ``data``.
    """
    # Empty node: this must be tested before anything indexes into the
    # (empty) set of labels, otherwise the lookup below would raise IndexError.
    if len(data) == 0:
        classes, class_counts = np.unique(original_data[target_name], return_counts=True)
        return classes[np.argmax(class_counts)]

    classes, class_counts = np.unique(data[target_name], return_counts=True)
    # np.unique returns sorted labels and argmax takes the first maximum, so
    # ties resolve to the smallest label.
    majority_class = classes[np.argmax(class_counts)]

    # Pure node: every row carries the same label.
    if len(classes) == 1:
        return classes[0]

    # No features left to split on: use the parent's majority class, falling
    # back to this node's own majority when there is no parent (top-level call)
    # so the result is never None.
    if len(features) == 0:
        return majority_class if parent_node_class is None else parent_node_class

    # Pick the feature with the highest information gain (first one on ties).
    gains = [info_gain(data, feature, target_name) for feature in features]
    best_feature = features[np.argmax(gains)]
    remaining_features = [name for name in features if name != best_feature]

    tree = {best_feature: {}}
    for value in np.unique(data[best_feature]):
        # Boolean-mask selection keeps rows with missing values in other
        # columns and preserves column dtypes, unlike `where(...).dropna()`.
        sub_data = data[data[best_feature] == value]
        tree[best_feature][value] = id3(
            sub_data, original_data, remaining_features, target_name, majority_class
        )

    return tree

# Example dataset with categorical values
data = {
    "Outlook": [
        "Sunny", "Sunny", "Overcast", "Rain", "Rain", "Rain", "Overcast",
        "Sunny", "Sunny", "Rain", "Sunny", "Overcast", "Overcast", "Rain",
    ],
    "Temperature": [
        "Hot", "Hot", "Hot", "Mild", "Cool", "Cool", "Cool",
        "Mild", "Cool", "Mild", "Mild", "Mild", "Hot", "Mild",
    ],
    "Humidity": [
        "High", "High", "High", "High", "Normal", "Normal", "Normal",
        "High", "Normal", "Normal", "Normal", "High", "Normal", "High",
    ],
    "Wind": [
        "Weak", "Strong", "Weak", "Weak", "Weak", "Strong", "Strong",
        "Weak", "Weak", "Weak", "Strong", "Strong", "Weak", "Strong",
    ],
    "PlayTennis": [
        "No", "No", "Yes", "Yes", "Yes", "No", "Yes",
        "No", "Yes", "Yes", "Yes", "Yes", "Yes", "No",
    ],
}

# Tabular view of the 14 observations above.
df = pd.DataFrame(data)

# Every column except the class label is a candidate split feature.
features = [name for name in df.columns if name != "PlayTennis"]

# Report the gain of splitting the full dataset on each candidate feature.
print("Information Gain for Each Feature:")
for feature in features:
    gain = info_gain(df, feature)
    print(f"{feature}: {gain:.4f}")

# Grow the tree from the full dataset and show the nested-dict result.
print("\nDecision Tree:")
id3_tree = id3(df, df, features)
print(id3_tree)

Information Gain for Each Feature:
Outlook: 0.2467
Temperature: 0.0292
Humidity: 0.1518
Wind: 0.0481

Decision Tree:
{'Outlook': {'Overcast': 'Yes', 'Rain': {'Wind': {'Strong': 'No', 'Weak': 'Yes'}}, 'Sunny': {'Humidity': {'High': 'No', 'Normal': 'Yes'}}}}


In [4]:
# Function to display the tree in a structured format
def print_tree(tree, indent=""):
    """Print a nested-dict decision tree as indented text.

    ``tree`` is either a leaf (any non-dict value, printed via ``str``) or a
    dict mapping a node name to a dict of ``{branch_value: subtree}``. Each
    branch is written as ``"  <value> -> "`` and its subtree follows on the
    same line, prefixed with ``indent`` plus four spaces.
    """
    if isinstance(tree, dict):
        deeper = indent + "    "
        for node_name, branches in tree.items():
            print(f"{indent}{node_name!s}")
            for branch_value, child in branches.items():
                # No newline here: the child's first line continues this one.
                print(f"{indent}  {branch_value} -> ", end="")
                print_tree(child, deeper)
        return

    # Leaf: emit the label itself.
    print(f"{indent}{tree!s}")

# Display the decision tree built in the previous cell. `id3_tree` must
# already exist in the kernel, so the ID3 cell has to be run first.
print("\nDecision Tree Structure:")
print_tree(id3_tree)



Decision Tree Structure:
Outlook
  Overcast ->     Yes
  Rain ->     Wind
      Strong ->         No
      Weak ->         Yes
  Sunny ->     Humidity
      High ->         No
      Normal ->         Yes
