## Decision Tree

In [4]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from collections import Counter

## Load the Iris dataset

In [5]:
iris = load_iris()
data = iris.data
target = iris.target
feature_names = iris.feature_names

## Function to calculate entropy

In [6]:
def entropy(y):
    hist = np.bincount(y)
    ps = hist / len(y)
    return -np.sum([p * np.log2(p) for p in ps if p > 0])

## Function to calculate information gain

In [7]:
def information_gain(X, y, feature):
    # Calculate original entropy
    original_entropy = entropy(y)
    
    # Split the data
    values = set(X[:, feature])
    weighted_entropy = 0
    for v in values:
        sub_y = y[X[:, feature] == v]
        weighted_entropy += (len(sub_y) / len(y)) * entropy(sub_y)
    
    # Information Gain
    gain = original_entropy - weighted_entropy
    return gain

## Function to print decision tree steps

In [8]:
def print_tree(X, y, feature_names, level=0):
    if len(set(y)) == 1:
        print(f"Level {level}")
        print(f"Count of {y[0]} = {len(y)}")
        print(f"Current Entropy is = {entropy(y):.6f}")
        print("Reached leaf Node")
        return

    # Calculate entropy
    current_entropy = entropy(y)
    # Find the best feature to split
    gains = [information_gain(X, y, f) for f in range(X.shape[1])]
    best_feature = np.argmax(gains)
    best_gain = gains[best_feature]
    
    # Print current state
    counts = Counter(y)
    print(f"Level {level}")
    for class_val, count in counts.items():
        print(f"Count of {class_val} = {count}")
    print(f"Current Entropy is = {current_entropy:.6f}")
    print(f"Splitting on feature {feature_names[best_feature]} with gain ratio {best_gain:.6f}")
    
    # Split the dataset
    values = set(X[:, best_feature])
    for v in values:
        sub_X = X[X[:, best_feature] == v]
        sub_y = y[X[:, best_feature] == v]
        print_tree(sub_X, sub_y, feature_names, level + 1)

# Run the decision tree and print steps
print_tree(data, target, feature_names)

Level 0
Count of 0 = 50
Count of 1 = 50
Count of 2 = 50
Current Entropy is = 1.584963
Splitting on feature petal length (cm) with gain ratio 1.446317
Level 1
Count of 0 = 4
Current Entropy is = -0.000000
Reached leaf Node
Level 1
Count of 0 = 13
Current Entropy is = -0.000000
Reached leaf Node
Level 1
Count of 0 = 7
Current Entropy is = -0.000000
Reached leaf Node
Level 1
Count of 0 = 7
Current Entropy is = -0.000000
Reached leaf Node
Level 1
Count of 0 = 13
Current Entropy is = -0.000000
Reached leaf Node
Level 1
Count of 0 = 1
Current Entropy is = -0.000000
Reached leaf Node
Level 1
Count of 0 = 2
Current Entropy is = -0.000000
Reached leaf Node
Level 1
Count of 0 = 1
Current Entropy is = -0.000000
Reached leaf Node
Level 1
Count of 0 = 2
Current Entropy is = -0.000000
Reached leaf Node
Level 1
Count of 1 = 5
Current Entropy is = -0.000000
Reached leaf Node
Level 1
Count of 1 = 7
Count of 2 = 1
Current Entropy is = 0.543564
Splitting on feature sepal length (cm) with gain ratio 0.543

## import libraries

In [37]:

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
!pip install graphviz scikit-learn
import graphviz
import pydotplus



## fitting the data

In [38]:
cl=DecisionTreeClassifier()
cl.fit(data,target)

## exporting data

In [39]:
!pip install pydotplus scikit-learn



In [None]:
import pydotplus
dot_data = export_graphviz(cl, out_file=None, 
                           feature_names=iris.feature_names,  
                           class_names=iris.target_names,  
                           filled=True, rounded=True,) 
graph=pydotplus.graph_from_dot_data(dot_data)

graph.write_pdf("iris1.pdf")