In [1]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split 

In [2]:
iris = datasets.load_iris()
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state = 1)

In [3]:
# Entropy or info_required function
def entropy(x, y):
    result = 0
    total = len(y) # total no. of data points
    for current_class in set(y):
        # p is no. of rows with current_class dvided by total no of rows
        p = (y == current_class).sum()/total
        # Formula for entropy
        result += -(p * np.log(p))
    return result

In [5]:
def info_gain(x, y, features, split_feature):
    before_split = 0    # info_gain before spliting on current split_feature
    total = len(y)      # total no of data points in node before spliting
    # Iterating through all classes in a node
    for current_class in set(y):
         # p is no. of rows with current_class dvided by total no of rows
        p = (y == current_class).sum() / total
        before_split += -(p * np.log(p))
     # info_gain after spliting on current split_feature   
    after_split = 0     
    #spliting data set into group acc.to split_feature
    for current_value in set(x[:,split_feature]):
        selected_row = (current_value//1 == x[:,split_feature]//1) #Gives a  bool array 
        # Selected row as per the current_value
        selected_row_x = x[selected_row] 
        selected_row_y = y[selected_row]
        current_total =  len(selected_row_y)
        info_gain_current_value = 0
        # Calculating info_gain of one of the child node after spliting
        for current_class in set(selected_row_y):
             # p is no. of rows with current_class dvided by total no of rows
            p = (selected_row_y == current_class).sum()/total
            info_gain_current_value += -(p * np.log(p))
        # Calculating the effective info gain of all the child node after spliting
        after_split += (current_total/total) * info_gain_current_value 
    return before_split - after_split

In [6]:
def split_gain(x, y, features, split_feature):
    result = 0
    total = len(y)
    for current_value in set(x[:,split_feature]):
        selected_row = (current_value//1 == x[:,split_feature]//1) #Gives a  bool array 
        selected_row_x = x[selected_row]
        selected_row_y = y[selected_row]
        current_total =  len(selected_row_y) # total no. of data points of child current node after split
        # r is no. of rows with current_value dvided by total no of rows
        r = current_total / total
        # Formula for split_info
        result += -(r * np.log(r))
    return result

In [7]:
# Gain ratio function
def gain_ratio(x, y, features, split_feature):
    a = info_gain(x, y, features, split_feature)
    b = split_gain(x, y, features, split_feature)
    ratio = a/b
    return ratio

In [17]:
# Decision tree with data and "l" is for the level of tree ie it's height
def decision_tree(x, y, features,l):
    print("Level",l)
    # base condition
    if set(y) == 1:
        print("Leaf node")
        print("count 1:",(1 == y).sum())
        print("count 2:",(2 == y).sum())
        print("count 0:",(0 == y).sum())
        print("Entropy:" , entropy(x, y))
        print("=========================")
        return
    elif len(features) == 0:
        print("Leaf node")
        print("count 1:",(1 == y).sum())
        print("count 2:",(2 == y).sum())
        print("count 0:",(0 == y).sum())
        print("Entropy:" , entropy(x, y))
        print("=========================")
        return
    else:
        # Entropy at a given node
        print("Entropy:" , entropy(x, y))
        first_iteration = True
        maximum = 0
        split_features = None
        # Iterating through all the features to find the best split
        for i in range(len(features)):
            # Calculating gain ratio for all the features and selecting a feature with highest gain_ratio
            a = gain_ratio(x, y, features, features[i])
            if first_iteration or a > maximum:
                maximum = a
                split_feature = features[i]
            first_iteration = False
        print("Spliting upon",split_feature)
        print("With gain_ratio",maximum)
        print("count 1:",(1 == y).sum())
        print("count 0:",(0 == y).sum())
        print("count 2:",(2 == y).sum())
        print("=========================")
        # Removing the split_feature from features
        features.remove(split_feature)
         #spliting data set into group acc.to split_feature
        for current_value in set(x[:,split_feature]//1):
            # We are getting the integer value . To make data lable 
            selected_row = (current_value//1 == x[:,split_feature]//1) #Gives a  bool array 
            selected_row_x = x[selected_row]
            selected_row_y = y[selected_row]
            # Calling decision_tree on each node
            decision_tree(selected_row_x, selected_row_y, features,l+1) 

In [18]:
features = [i for i in range(x_train.shape[1])]
decision_tree(x_train, y_train, features,l=0)

Level 0
Entropy: 1.0956714129052516
Spliting upon 2
With gain_ratio -0.18884039088887258
count 1: 34
count 0: 37
count 2: 41
Level 1
Entropy: 0.0
Spliting upon 0
With gain_ratio -0.4987998970303926
count 1: 0
count 0: 37
count 2: 0
Level 2
Entropy: 0.0
Spliting upon 1
With gain_ratio -0.5135329684094694
count 1: 0
count 0: 18
count 2: 0
Level 3
Entropy: 0.0
Spliting upon 3
With gain_ratio nan
count 1: 0
count 0: 2
count 2: 0
Level 4
Leaf node
count 1: 0
count 2: 0
count 0: 2
Entropy: 0.0
Level 3
Leaf node
count 1: 0
count 2: 0
count 0: 16
Entropy: 0.0
Level 2
Leaf node
count 1: 0
count 2: 0
count 0: 19
Entropy: 0.0
Level 1
Leaf node
count 1: 9
count 2: 0
count 0: 0
Entropy: 0.0
Level 1
Leaf node
count 1: 24
count 2: 6
count 0: 0
Entropy: 0.5004024235381879
Level 1
Leaf node
count 1: 1
count 2: 27
count 0: 0
Entropy: 0.15407610367102942
Level 1
Leaf node
count 1: 0
count 2: 8
count 0: 0
Entropy: 0.0


  """
