In [53]:
import numpy as np
import pandas as pd
import math
from sklearn import datasets

In [54]:
iris = datasets.load_iris()

In [55]:
df = pd.DataFrame(iris.data)
df.columns = ["sl","sw","pl","pw"]

## Function to find the label for a value

In [56]:
def label(val, *boundaries):
    if (val < boundaries[0]):
        return 'a'
    elif (val < boundaries[1]):
        return 'b'
    elif (val < boundaries[2]):
        return 'c'
    else:
        return 'd'

## Function to convert a continous data into labelled data

In [57]:
def toLabel(df,old_feature_name):
    second = df[old_feature_name].mean()
    minimum = df[old_feature_name].min()
    first = (second + minimum)/2
    maximum = df[old_feature_name].max()
    third = (second + maximum)/2
    return df[old_feature_name].apply(label, args = (first, second, third))

## Convert all columns to labelled data 

In [58]:
df['sl_labeled'] = toLabel(df, 'sl')
df['sw_labeled'] = toLabel(df, 'sw')
df['pl_labeled'] = toLabel(df, 'pl')
df['pw_labeled'] = toLabel(df, 'pw')
df

Unnamed: 0,sl,sw,pl,pw,sl_labeled,sw_labeled,pl_labeled,pw_labeled
0,5.1,3.5,1.4,0.2,b,c,a,a
1,4.9,3.0,1.4,0.2,a,b,a,a
2,4.7,3.2,1.3,0.2,a,c,a,a
3,4.6,3.1,1.5,0.2,a,c,a,a
4,5.0,3.6,1.4,0.2,a,c,a,a
...,...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,c,b,c,d
146,6.3,2.5,5.0,1.9,c,a,c,d
147,6.5,3.0,5.2,2.0,c,b,c,d
148,6.2,3.4,5.4,2.3,c,c,d,d


In [59]:
df.drop(['sl','sw','pl','pw'],axis = 1,inplace =True)

In [60]:
set(df['sl_labeled'])

{'a', 'b', 'c', 'd'}

In [68]:
def  gain_calculator(df, y):
    current_entropy = 0
    for class_ in set(y):
        current_entropy -= (sum(y == class_)/len(y))*np.log10(sum(y==class_)/len(y))
        
       # ***************************************************************************************** #
    
    weight_sum_of_node_entropy = 0
    split_info = 0
    
    for class_ in set(df):
        current_y_values = y[df==class_]
        entropy_of_perticular_node = 0
        
        for i in set(current_y_values):
            entropy_of_perticular_node -=(sum(current_y_values==i)/len(current_y_values))*np.log10(sum(current_y_values == i)/len(current_y_values))
            
        weight_sum_of_node_entropy += entropy_of_perticular_node*(len(current_y_values)/len(y))
        split_info -= (len(current_y_values)/len(y))*(np.log10(len(current_y_values)/len(y)))
        
    info_gain = current_entropy - weight_sum_of_node_entropy
    gain_ratio = info_gain/split_info
                
    # ********************************************************************************************** #
    
    return current_entropy,gain_ratio
    

In [69]:
def Build_tree(df, y, unused_features):
    
    #BASE CASE
    if(len(unused_features)==0) or (len(set(y))==1):
        
    # 1. UNUSED IS EMPTY
    # 2. Y CONTAINS ONLY ONE DISTINCT VALUE
        return
    best_feature = ""
    final_gain = -1
    final_entropy = 0
    
    for f in  unused_features:
        current_entropy, gain_ratio = gain_calculator(df[f], y)
        
        if gain_ratio>final_gain:
            final_gain = gain_ratio
            best_feature = f
            final_entropy= current_entropy
            
    # HERE YOU SHOULD HNOW THE BEST FEATURE
    # PRINT IT OUT 
    for i in set(y):
        print("Count of:",i,"=" ,sum(y==i))
        
    print("Current Entropy is: =", final_entropy)
    
    if final_entropy == 0:
        print('Reached leaf node')
        
    else:
        print("splitting on feature", best_feature ,"with gain ratio" , final_gain)
    print()
        
    # REMOVE BEST FEATURE FROM UNUSED FEATURES
    unused_features.remove(best_feature)
    # LOOP OVER POSSIBLE BEST FEATURES
    
    for possible_value in set(df[best_feature]):
        Build_tree(df.loc[df[best_feature]==possible_value], y[df[best_feature]==possible_value], unused_features)
    
    # CALL BUILD TREE RECURSIVELY
    

In [70]:
y =(iris.target)
unused_features =set(df.columns)
Build_tree(df, y, unused_features)

Count of: 0 = 50
Count of: 1 = 50
Count of: 2 = 50
Current Entropy is: = 0.4771212547196624
splitting on feature pw_labeled with gain ratio 0.6996382036222089

Count of: 1 = 40
Count of: 2 = 16
Current Entropy is: = 0.2598251810131059
splitting on feature pl_labeled with gain ratio 0.43340994956210654

Count of: 1 = 39
Count of: 2 = 8
Current Entropy is: = 0.1981353138938235
splitting on feature sl_labeled with gain ratio 0.12674503775809334

Count of: 1 = 23
Count of: 2 = 7
Current Entropy is: = 0.23594037110284793
splitting on feature sw_labeled with gain ratio 0.07092036405148884

