<a href="https://colab.research.google.com/github/SyedIzzatUllah/Decision-Tree-Implementation-/blob/main/Decision_Tree_Implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from sklearn import datasets
import pandas as pd
import math as m

In [2]:
# Load the iris dataset through scikit-learn's datasets module; later cells
# read the measurements from iris.data and the class labels from iris.target.
iris = datasets.load_iris()

In [3]:
# Wrap the raw measurements in a DataFrame with short column names
# (presumably sepal length/width and petal length/width -- confirm against
# iris.feature_names).
df = pd.DataFrame(iris.data, columns=["sl", "sw", "pl", "pw"])

In [4]:
#Function to find the label for a value
#if MIN_Value <= val < (MIN_Value + Mean_Value)/2 then it is assigned label a
#if (MIN_Value + Mean_Value)/2 <= val < Mean_Value then it is assigned label b
#if Mean_Value <= val < (Mean_Value + MAX_Value)/2 then it is assigned label c
#if (Mean_Value + MAX_Value)/2 <= val <= MAX_Value then it is assigned label d

def label(val, *boundaries):
    """Map a numeric value to one of the bin labels 'a', 'b', 'c' or 'd'.

    The first three entries of ``boundaries`` act as exclusive upper limits
    for 'a', 'b' and 'c', checked in that order. A value that is not below
    any of them is labelled 'd'.
    """
    for position, tag in enumerate('abc'):
        # Each boundary is only looked up when the previous comparison failed.
        if val < boundaries[position]:
            return tag
    return 'd'

#Function to convert continuous data into labelled data
#There are 4 labels - a, b, c, d
def toLabel(df, old_feature_name):
    """Discretise one continuous column of ``df`` into the labels a/b/c/d.

    The three cut points are the midpoint of (min, mean), the mean itself and
    the midpoint of (max, mean) of the column. The result is returned as a new
    Series; nothing is assigned back into ``df`` here.
    """
    column = df[old_feature_name]
    lowest, average, highest = column.min(), column.mean(), column.max()
    cut_points = ((lowest + average) / 2, average, (highest + average) / 2)
    return column.apply(label, args=cut_points)


In [5]:
#Convert all columns to labelled data
df['sl_labeled'] = toLabel(df, 'sl')
df['sw_labeled'] = toLabel(df, 'sw')
df['pl_labeled'] = toLabel(df, 'pl')
df['pw_labeled'] = toLabel(df, 'pw')
df

Unnamed: 0,sl,sw,pl,pw,sl_labeled,sw_labeled,pl_labeled,pw_labeled
0,5.1,3.5,1.4,0.2,b,c,a,a
1,4.9,3.0,1.4,0.2,a,b,a,a
2,4.7,3.2,1.3,0.2,a,c,a,a
3,4.6,3.1,1.5,0.2,a,c,a,a
4,5.0,3.6,1.4,0.2,a,c,a,a
...,...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,c,b,c,d
146,6.3,2.5,5.0,1.9,c,a,c,d
147,6.5,3.0,5.2,2.0,c,b,c,d
148,6.2,3.4,5.4,2.3,c,c,d,d


In [6]:
# Keep only the labelled columns; the continuous originals are no longer needed.
df = df.drop(columns=['sl', 'sw', 'pl', 'pw'])

In [7]:
# Show which distinct labels were produced for the 'sl' column.
set(df['sl_labeled'])

{'a', 'b', 'c', 'd'}

In [8]:
#info for node n
def entropy(df):
    """Return the entropy of ``df['output']`` using the natural logarithm.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'output' column holding the class of each row.

    Returns
    -------
    number
        ``-sum(p_i * ln(p_i))`` over the distinct classes, where ``p_i`` is
        the fraction of rows in class ``i``. Returns 0 for an empty frame.
    """
    # Count rows directly. The previous `df.count()[0]` took the non-null
    # count of whichever column came first, so a missing value in a feature
    # skewed the class probabilities, and it relied on positional indexing
    # of a label-indexed Series.
    total = len(df)
    info_n = 0
    # value_counts only yields classes that actually occur, so every p_i is
    # strictly positive and log(p_i) is always defined.
    for num in df['output'].value_counts():
        p_i = num / total
        info_n -= p_i * m.log(p_i)
    return info_n
 

In [9]:
def show(df):
    """Print the number of rows per class in ``df['output']``, then the entropy.

    One "count of" line is printed for each distinct value of the 'output'
    column, followed by a line with the value returned by ``entropy(df)``.
    """
    # The loop variable used to be a local called `m`, which shadowed the
    # notebook's `math` alias inside this function; use a descriptive name.
    for cls in set(df['output']):
        # Count matching rows directly instead of the non-null count of the
        # first column of the filtered frame.
        members = int((df['output'] == cls).sum())
        print("count of ",cls, "=" ,members)
    print("Current Entropy is =",entropy(df))

In [10]:
def gain_ratio(df,feat):
    """Return the gain ratio obtained by splitting ``df`` on column ``feat``.

    Gain ratio is the information gain (entropy of ``df`` minus the weighted
    entropy of the subsets, one per distinct value of ``feat``) divided by the
    split information ``-sum(w * ln(w))``, where ``w`` is the fraction of rows
    in each subset.

    Returns 0.0 when the split information is zero, i.e. when ``feat`` takes a
    single value in ``df`` or ``df`` is empty. Such a split separates nothing,
    and dividing by zero would otherwise yield NaN or raise.
    """
    total = len(df)
    info_after_split = 0
    split_info = 0
    for value in set(df[feat]):
        # Filter once and reuse the subset for both the size and the entropy.
        new_df = df[df[feat] == value]
        weight = len(new_df) / total
        info_after_split += weight * entropy(new_df)
        split_info -= weight * m.log(weight)

    if split_info == 0:
        return 0.0

    info_gain = entropy(df) - info_after_split
    return info_gain / split_info

In [12]:
def build_tree(df, y, unused_features,level):
    """Recursively print a decision tree grown by greedy gain-ratio splitting.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows reaching this node; must contain an 'output' column with the
        class of each row plus one column per feature in ``unused_features``.
    y : any
        Not read by this function; it is only forwarded to recursive calls so
        the existing call signature keeps working.
    unused_features : set
        Names of the feature columns still available for splitting on this
        path. The caller's set is not modified.
    level : int
        Depth of the current node, printed as a header.

    Nothing is returned; the tree is reported via ``print``. Recursion stops
    when no features remain, when the node holds a single class, or when no
    remaining feature achieves a positive gain ratio.
    """
    print("Level ",level)
    # Base case 1: every feature has already been used on this path.
    if len(unused_features) == 0:
        show(df)
        print("no features left")
        print(" ")
        return
    # Base case 2: all rows at this node share one class.
    if len(set(df['output'])) == 1:
        show(df)
        print("leaf node")
        print(" ")
        return
    show(df)

    # Pick the feature with the largest strictly positive gain ratio.
    best_feature = ""
    max_gain = 0
    for f in unused_features:
        gain = gain_ratio(df, f)
        if gain > max_gain:
            max_gain = gain
            best_feature = f

    # No feature improves on the current node. Previously the empty name was
    # used to index the frame, which raised KeyError.
    if best_feature == "":
        print("no feature with positive gain ratio")
        print(" ")
        return

    print("Best Feature ",best_feature,"with gain ratio ",max_gain)

    # Build a new set so sibling branches still see their own feature list.
    remaining_features = unused_features - {best_feature}
    print(" ")
    # Recurse into one child per distinct value of the chosen feature.
    for i in set(df[best_feature]):
        # drop() returns a fresh frame, so the filtered slice is never
        # mutated in place.
        mod = df[df[best_feature] == i].drop(columns=best_feature)
        build_tree(mod, y, remaining_features, level+1)


In [13]:
# Attach the class labels to the labelled features and print the tree from depth 0.
y = pd.DataFrame(iris.target)
# Exclude 'output' explicitly: on a re-run of this cell the column already
# exists, and the target must never be offered as a split feature.
unused_features = set(df.columns) - {'output'}
df['output']=y
build_tree(df, y, unused_features,0)

Level  0
count of  0 = 50
count of  1 = 50
count of  2 = 50
Current Entropy is = 1.0986122886681096
Best Feature  pw_labeled with gain ratio  0.6996382036222091
 
Level  1
count of  1 = 10
Current Entropy is = 0.0
leaf node
 
Level  1
count of  0 = 50
Current Entropy is = 0.0
leaf node
 
Level  1
count of  2 = 34
Current Entropy is = 0.0
leaf node
 
Level  1
count of  1 = 40
count of  2 = 16
Current Entropy is = 0.5982695885852573
Best Feature  pl_labeled with gain ratio  0.43340994956210666
 
Level  2
count of  1 = 1
Current Entropy is = 0.0
leaf node
 
Level  2
count of  2 = 8
Current Entropy is = 0.0
leaf node
 
Level  2
count of  1 = 39
count of  2 = 8
Current Entropy is = 0.45622342016761397
Best Feature  sl_labeled with gain ratio  0.1267450377580933
 
Level  3
count of  1 = 14
Current Entropy is = 0.0
leaf node
 
Level  3
count of  2 = 1
Current Entropy is = 0.0
leaf node
 
Level  3
count of  1 = 2
Current Entropy is = 0.0
leaf node
 
Level  3
count of  1 = 23
count of  2 = 7
Cu