<a href="https://colab.research.google.com/github/SankethHanasi/6thSem-ML-Lab/blob/main/1BM22CS242_Lab2_ID3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

# Sample weather dataset
# Play-tennis style weather records: one entry per day in every column.
data = {
    'Day': list(range(1, 15)),
    'Outlook': [
        'Sunny', 'Sunny', 'Overcast', 'Rainy', 'Rainy', 'Rainy', 'Overcast',
        'Sunny', 'Sunny', 'Rainy', 'Sunny', 'Overcast', 'Overcast', 'Rainy',
    ],
    'Temperature': [
        'Hot', 'Hot', 'Hot', 'Mild', 'Cool', 'Cool', 'Cool',
        'Mild', 'Cool', 'Mild', 'Mild', 'Mild', 'Hot', 'Mild',
    ],
    'Humidity': [
        'High', 'High', 'High', 'High', 'Normal', 'Normal', 'Normal',
        'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'High',
    ],
    'Wind': [
        'Weak', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong',
        'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Strong',
    ],
    'Decision': [
        'No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes',
        'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No',
    ],
}

# Tabular view of the records; each dictionary key becomes a column.
df = pd.DataFrame(data)

# Function to calculate entropy
def entropy(target):
    """Return the Shannon entropy (in bits) of the class labels in ``target``.

    Args:
        target: pandas Series of class labels.

    Returns:
        ``-sum(p * log2(p))`` over the classes that actually occur, where
        ``p`` is each class's count divided by ``len(target)``. The result
        is ``0.0`` (never ``-0.0``) when only one class is present.
    """
    class_counts = target.value_counts()
    # A categorical Series reports unused categories with a count of zero;
    # drop them so log2(0) is never evaluated (0 * log2(0) counts as 0).
    class_counts = class_counts[class_counts > 0]
    probabilities = class_counts / len(target)
    result = -np.sum(probabilities * np.log2(probabilities))
    # Negating a zero sum gives -0.0, which formats as "-0.0000";
    # adding +0.0 normalises it to positive zero and changes nothing else.
    return result + 0.0

# Function to calculate information gain
def information_gain(data, feature, target):
    """Return how much splitting on ``feature`` reduces the entropy of ``target``.

    Args:
        data: table holding the feature columns (indexed as ``data[feature]``).
        feature: name of the column to split on.
        target: class labels, selected with boolean masks built from ``data``.

    Returns:
        Entropy of ``target`` minus the size-weighted sum of the entropies of
        the label subsets, one subset per distinct value of the feature.
    """
    parent_entropy = entropy(target)
    column = data[feature]
    total = len(target)

    # Accumulate each branch's entropy, weighted by its share of the examples.
    remainder = 0
    for branch_value in column.unique():
        branch_labels = target[column == branch_value]
        weight = len(branch_labels) / total
        remainder += weight * entropy(branch_labels)

    return parent_entropy - remainder

# Function to print entropy and information gain for each feature
def print_entropy_and_gain(data, features, target):
    """Print the target entropy and the information gain of each feature.

    Args:
        data: table holding the feature columns.
        features: iterable of feature (column) names to report on.
        target: class labels matching the rows of ``data``.

    The "Entropy" figure on every line is the entropy of ``target`` itself,
    so it is the same for all features; only the information gain varies.
    """
    print("\nEntropy and Information Gain for each feature:")
    # The target entropy does not depend on the feature: compute it once
    # instead of once per loop iteration.
    ent = entropy(target)
    for feature in features:
        gain = information_gain(data, feature, target)
        print(f"Feature: {feature} | Entropy: {ent:.4f} | Information Gain: {gain:.4f}")

# Function to build the decision tree recursively
def build_tree(data, target, features):
    """Recursively build a decision tree by greedy information-gain splits.

    Args:
        data: pandas DataFrame holding the feature columns.
        target: pandas Series of class labels aligned with ``data``.
        features: names of the columns still available for splitting.

    Returns:
        A leaf label when every label in ``target`` is identical or when no
        features remain (the most frequent label is used in that case).
        Otherwise a nested dict ``{best_feature: {value: subtree, ...}}`` with
        one entry per distinct non-missing value of the chosen feature.
    """
    # Base case: all examples share one label, so this is a pure leaf.
    if len(target.unique()) == 1:
        return target.iloc[0]

    # Base case: nothing left to split on, fall back to the majority label.
    if len(features) == 0:
        return target.mode()[0]

    # Choose the feature whose split yields the largest information gain.
    gains = {feature: information_gain(data, feature, target) for feature in features}
    best_feature = max(gains, key=gains.get)

    # Every branch continues with the same reduced feature list, so build it
    # once rather than on each loop iteration.
    remaining_features = [f for f in features if f != best_feature]

    column = data[best_feature]
    branches = {}
    # Missing values never compare equal to anything, so they would select an
    # empty subset (and an empty or failing subtree); skip them.
    for value in column.dropna().unique():
        # One boolean mask selects the matching rows of both data and target.
        mask = column == value
        branches[value] = build_tree(data[mask], target[mask], remaining_features)

    return {best_feature: branches}

# Function to print the tree in a visually structured way
def print_tree(tree, indent=""):
    """Print a nested-dict decision tree, one node per line.

    Args:
        tree: either a leaf value or a dict ``{feature: {value: subtree}}``.
        indent: whitespace prefix written before this node's text.

    Each branch value is printed without a trailing newline, so the subtree's
    first line (including its own indent) continues on the same line.
    """
    # Leaf: print the label and stop.
    if not isinstance(tree, dict):
        print(f"{indent}{tree}")
        return

    deeper = indent + "    "
    for feature, branches in tree.items():
        print(f"{indent}{feature}:")
        for branch_value, child in branches.items():
            print(f"{indent}  {branch_value} ->", end=" ")
            print_tree(child, deeper)

# Columns used as predictors ('Day' is only a row identifier and is left out)
features = ['Outlook', 'Temperature', 'Humidity', 'Wind']

# Column holding the class label to predict
target = df['Decision']

# Step 1: report the target entropy and each feature's information gain
print_entropy_and_gain(df, features, target)

# Step 2: grow the decision tree from the full dataset
tree = build_tree(df, target, features)

# Step 3: display the resulting tree under a heading
print("\nDecision Tree:")
print_tree(tree, "    ")



Entropy and Information Gain for each feature:
Feature: Outlook | Entropy: 0.9403 | Information Gain: 0.2467
Feature: Temperature | Entropy: 0.9403 | Information Gain: 0.0292
Feature: Humidity | Entropy: 0.9403 | Information Gain: 0.1518
Feature: Wind | Entropy: 0.9403 | Information Gain: 0.0481

Decision Tree:
    Outlook:
      Sunny ->         Humidity:
          High ->             No
          Normal ->             Yes
      Overcast ->         Yes
      Rainy ->         Wind:
          Weak ->             Yes
          Strong ->             No
