In [1]:
from sklearn import datasets
import pandas as pd
import numpy as np

In [2]:
iris = datasets.load_iris()

In [3]:
df = pd.DataFrame(iris.data)
df.columns = ["sl", "sw", 'pl', 'pw']

In [4]:
#Function to find label for a value
#if MIN_Value <=val < (m + Mean_Value) / 2 then it is assigned label a
#if (m + Mean_Value) <=val < Mean_Value then it is assigned label b
#if (Mean_Value) <=val < (Mean_Value + MAX_Value)/2 then it is assigned label c
#if (Mean_Value + MAX_Value)/2 <=val <= MAX_Value  then it is assigned label d

def label(val, *boundaries):
    if (val < boundaries[0]):
        return 'a'
    elif (val < boundaries[1]):
        return 'b'
    elif (val < boundaries[2]):
        return 'c'
    else:
        return 'd'

#Function to convert a continuous data into labelled data
#There are 4 lables  - a, b, c, d
def toLabel(df, old_feature_name):
    second = df[old_feature_name].mean()
    minimum = df[old_feature_name].min()
    first = (minimum + second)/2
    maximum = df[old_feature_name].max()
    third = (maximum + second)/2
    return df[old_feature_name].apply(label, args= (first, second, third))

In [5]:
#Convert all columns to labelled data
df['sl_labeled'] = toLabel(df, 'sl')
df['sw_labeled'] = toLabel(df, 'sw')
df['pl_labeled'] = toLabel(df, 'pl')
df['pw_labeled'] = toLabel(df, 'pw')
df

Unnamed: 0,sl,sw,pl,pw,sl_labeled,sw_labeled,pl_labeled,pw_labeled
0,5.1,3.5,1.4,0.2,b,c,a,a
1,4.9,3.0,1.4,0.2,a,b,a,a
2,4.7,3.2,1.3,0.2,a,c,a,a
3,4.6,3.1,1.5,0.2,a,c,a,a
4,5.0,3.6,1.4,0.2,a,c,a,a
5,5.4,3.9,1.7,0.4,b,d,a,a
6,4.6,3.4,1.4,0.3,a,c,a,a
7,5.0,3.4,1.5,0.2,a,c,a,a
8,4.4,2.9,1.4,0.2,a,b,a,a
9,4.9,3.1,1.5,0.1,a,c,a,a


In [6]:
df.drop(['sl', 'sw', 'pl', 'pw'], axis = 1, inplace = True)

In [7]:
set(df['sl_labeled'])

{'a', 'b', 'c', 'd'}

In [8]:
import math

In [9]:
def t_entropy(x, y):
    result = 0
    total = len(y) # total no. of data points
    for current_class in set(y):
        # p is no. of rows with current_class dvided by total no of rows
        p = (y == current_class).sum()/total
        # Formula for entropy
        result += -(p * math.log(p))
    return result

In [16]:
def entropy(df,y,selected_feature):
    label_counter=0                                           #counts occurrence of each class in feature
    prob_label=0                                              
    possible_values = set(df[selected_feature])               
    total_length=len(df[selected_feature])
    temp=0
    for v in possible_values:
        label_counter=len(df[df[selected_feature]==v])
        prob_label=label_counter/total_length                # gives the value of count of label/total length(d1/d)
        for current_class in set(y):
            p = (df[selected_feature] == current_class).sum()/label_counter
            if(p!=0):
                temp += -1 * p * np.log(p)                    #finds total info gain on that feature (-p[i] * log(p[i]))
            else:
                continue
    return prob_label * temp        

In [11]:
def split_gain(x, y,split_feature):
    result = 0
    total = len(y)
    for current_value in set(x[split_feature]):
        selected_row = (current_value == x[split_feature]) #Gives a  bool array 
        selected_row_x = x[selected_row]
        selected_row_y = y[selected_row]
        current_total =  len(selected_row_y) # total no. of data points of child current node after split
        # r is no. of rows with current_value dvided by total no of rows
        r = current_total / total
        # Formula for split_info
        result += (-r * np.log(r))
    return result

In [12]:
def gain(df,y,selected_feature):
    information_gain=t_entropy(df,y)- entropy(df,y,selected_feature)
    split_info=split_gain(df,y,selected_feature)
    return information_gain/split_info

In [13]:
def countclasses(df,y):
    counter=0
    pos_values=set(y)
    for a in range(len(pos_values)):
        counter=0
        for A in range(0,len(y)):
            if a==y[A]:
                counter+=1
        print("Count of class ",a,end=" ")
        print(" = ",counter)

In [14]:
def build_tree(df, y, unused_features,level_counter=0):
    #base case
    # 1. unused is empty
    # 2. y contains only one distinct value
    
    
    if set(y)==1 or len(unused_features)==0:
        print("Level ",level_counter)
        countclasses(df,y)
        print("Current Entropy = ",t_entropy(df,y))
        print("Reached Leaf Node")
        return
    
    max_gain=-1
    split_f=""
    label_counter=0
    #print(unused_features)
    for f in unused_features:
        possible_values = set(df[f])
        # loop over possible values : val
        # find subset of df & y with f == val
        # find number of mistakes in this subset 
        # if we predict the most common y as the output
        # find sum of all these mistakes
        # update best feature so that that particular feature
        # makes least number of mistakes
        temp_gain=gain(df,y,f)
        if temp_gain>max_gain:
            max_gain=temp_gain
            split_f=f
    if (max_gain==-1) :
        return
    # here you should know the best feature
    # print it out
    print("Level ",level_counter)
    level_counter+=1
    countclasses(df,y)
    print("Current Entropy: ",t_entropy(df,y))
    print("Splitting on feature ",split_f,end="")
    print(" with gain ratio ",max_gain)
    diff_labels=set(df[split_f])
    #print(type(unused_features),len(unused_features))
    unused_features.remove(split_f)
    #print(type(unused_features),len(unused_features))
    for i in diff_labels:
        new_samples=df[df[split_f]==i]
        include=(df[split_f]==i)
        new_df=new_samples
        new_y=y[include]
        #new_y=np.array(new_y,dtype='float')
        #print(type(new_df),type(new_y))
        build_tree(new_df,new_y,unused_features,level_counter)
    # remove best feature from unused features
    # loop over possible values of best feature
    # call build tree recursively

In [15]:
y = iris.target
unused_features = set(df.columns)
build_tree(df, y, unused_features)

Level  0
Count of class  0  =  50
Count of class  1  =  50
Count of class  2  =  50
Current Entropy:  1.0986122886681096
Splitting on feature  sw_labeled with gain ratio  0.9192883564092794
Level  1
Count of class  0  =  1
Count of class  1  =  13
Count of class  2  =  5
Current Entropy:  0.7659373365785205
Splitting on feature  pl_labeled with gain ratio  0.8049889946966282
Level  2
Count of class  0  =  0
Count of class  1  =  8
Current Entropy:  0.6365141682948128
Splitting on feature  sl_labeled with gain ratio  0.6930475957884882
Level  3
Count of class  0  =  0
Count of class  1  =  4
Current Entropy:  0.6365141682948128
Splitting on feature  pw_labeled with gain ratio  0.733680436651211
Level  4
Count of class  0  =  0
Count of class  1  =  3
Current Entropy =  0.5623351446188083
Reached Leaf Node
Level  4
Count of class  0  =  0
Current Entropy =  0.0
Reached Leaf Node
Level  4
Count of class  0  =  0
Current Entropy =  0.0
Reached Leaf Node
Level  3
Count of class  0  =  0
Cur

  after removing the cwd from sys.path.
