In [1]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import math as m

In [2]:
#entropy function for decison tree.
def entropy(y):
    set_val = set(y)
    total = len(y)
    single_entropy = 0
    for l in set_val:
        p = (y == l).sum() / total
        single_entropy += -p*m.log2(p)
    return single_entropy

In [3]:
#info gain function for decision tree.
def info_gain(x , y , index):
    initial_entropy = entropy(y)
    step = x[x.columns[index]].mean()
    init = step/2
    condition = 0
    maxm = -1e10
    split_info_max = -1e10
    while init <= 2*step :
        final_entropy = 0
        y_new_greater  = y.loc[x[x.columns[index]] >= init]
        y_new_less  = y.loc[x[x.columns[index]] < init]
        final_entropy = (entropy(y_new_greater)) + (entropy(y_new_less))
        if final_entropy > maxm:
            maxm = final_entropy
            condition = init
            split_info_max = split_info(y_new_greater , y_new_less) 
        init += init
    return maxm - entropy(y), condition , split_info_max

In [4]:
def split_info(y1 , y2) :
    d1 = len(y1)
    d2 = len(y2)
    d = d1 + d2
    if(d1 == 0 or d2 == 0 or d == 0):
        return 0
    p = (-1*d1*m.log2(d1/d))/d +  (-1*d2*m.log2(d2/d))/d
    return p

In [5]:
def gain_ratio(x ,y ,index):
    return info_gain(x , y , index)[0] / info_gain(x , y , index)[2]

In [6]:
def dt_new(x , y , feature_index , level):
    
    #return if pure node in encountered . 
    if(len(set(y)) == 1): 
        print('Level in the tree is  : ' , level)
        print('Current entropy is  : ' , 0.0)
        print('Count of feature :' , len(y))
        print('Reached leaf node')
        print()
        return
    
    #if all features are traversed
    
    if(len(feature_index) == 0):
        print('Level is  : ' , level)
        print('Current entropy is : ' , entropy(y))
        print('Leaf node reached , No features left!')
        print()
        return
    maxm = -10000000
    best_feature = -1
    condition = 0
    for i in feature_index:
        gain = gain_ratio(x , y , i)
        if gain > maxm:
            maxm = gain
            best_feature = i
            condition = info_gain(x , y , i)[1]
    # printing data
    
    print('Level is  : ' , level)
    print('Current entropy is -   : ' , entropy(y))
    print('Splitting on feature - ' , x.columns[best_feature] ,  '-:with gain ratio of -   : ' , maxm)
    print('num_features :' , len(y))
    print()
    
    # class to split upon.
    labels = set(x[x.columns[best_feature]])
    
    #removing features traversed
    if best_feature not in feature_index:
        return
    feature_index.remove(best_feature)
    
    x_new_greater  = x.loc[x[x.columns[best_feature]] >= condition]
    y_new_greater  = y.loc[x[x.columns[best_feature]] >=  condition]
    x_new_lesser  = x.loc[x[x.columns[best_feature]] < condition]
    y_new_lesser  = y.loc[x[x.columns[best_feature]] <  condition]
    
    #recursively calling to print 
    dt_new(x_new_greater , y_new_greater , feature_index , level + 1)
    dt_new(x_new_lesser , y_new_lesser , feature_index , level + 1)
    feature_index.append(best_feature)

In [7]:
iris = datasets.load_iris()

#  converting dataset to dataframe
df = pd.DataFrame(data= np.c_[iris['data'], iris['target']],columns= iris['feature_names'] + ['target'])
df.columns = ['sl' , 'sw' , 'pl' , 'pw' , 'target']

#  dataset into test and train 
x_train , x_test = train_test_split(df.iloc[: , 0:-1] , random_state = 20)
y_train , y_test = train_test_split(df.iloc[: , -1] , random_state = 20)
list_of_features = x_train.columns.tolist()



#taking out feature indices(column indices)
feature_idx = [i for i in range(len(x_train.columns))]
feature_idx

[0, 1, 2, 3]

In [8]:
feature_idx = [i for i in range(len(x_train.columns))]
dt_new(x_train , y_train , feature_idx , 0)

  


Level is  :  0
Current entropy is -   :  1.5844996446144277
Splitting on feature -  sw -:with gain ratio of -   :  1.0502869058388165
num_features : 112

Level is  :  1
Current entropy is -   :  1.3090909691036434
Splitting on feature -  pl -:with gain ratio of -   :  0.1618696099795078
num_features : 55

Level is  :  2
Current entropy is -   :  1.4690640517640445
Splitting on feature -  sl -:with gain ratio of -   :  0.22174921219934826
num_features : 31

Level is  :  3
Current entropy is -   :  0.8112781244591328
Splitting on feature -  pw -:with gain ratio of -   :  0.18110652893703225
num_features : 20

Level in the tree is  :  4
Current entropy is  :  0.0
Count of feature : 11
Reached leaf node

Level is  :  4
Current entropy is :  0.9910760598382222
Leaf node reached , No features left!

Level is  :  3
Current entropy is -   :  0.8658566174572235
Splitting on feature -  pw -:with gain ratio of -   :  0.2992387810369147
num_features : 11

Level is  :  4
Current entropy is :  1.148