In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics

### Loading Iris Datasets
https://www.kaggle.com/uciml/iris

In [2]:
df=pd.read_csv('Iris.csv')

In [3]:
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
df.shape

(150, 6)

In [5]:
df.dtypes

Id                 int64
SepalLengthCm    float64
SepalWidthCm     float64
PetalLengthCm    float64
PetalWidthCm     float64
Species           object
dtype: object

In [6]:
df.drop(columns=['Id'],inplace=True)
df.rename(columns={'Species': 'label'},inplace=True)

### train test split

In [7]:
df_train,df_test=train_test_split(df,test_size=1/5,shuffle=True)
df_train=df_train.reset_index(drop=True)
df_test=df_test.reset_index(drop=True)

In [8]:
print(df_train.shape,df_test.shape)

(120, 5) (30, 5)


In [9]:
df_train.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,label
0,6.8,2.8,4.8,1.4,Iris-versicolor
1,5.4,3.9,1.3,0.4,Iris-setosa
2,5.6,2.5,3.9,1.1,Iris-versicolor
3,5.9,3.2,4.8,1.8,Iris-versicolor
4,6.0,2.2,4.0,1.0,Iris-versicolor


### Making decision tree classifier

#### Entropy and Information Gain

In [10]:
# function to calculate entropy of a system
def entropy(df):
    '''
    df: pandas DataFrame with numeric feature value, dataframe target column must be label.
    Returns: Float (entropy)
    '''
    count=[]
    for i in df['label'].value_counts():
        count.append(i)
    count=np.array(count)
    prob=count/count.sum()
    entropy=(-1)*np.sum(prob*np.log2(prob))
    return entropy

# function to calculate information gain about a particular feature
def infoGain(df,f,fval):
    ''' 
    df: pandas DataFrame with numeric feature value, dataframe target column must be label.
    f: feature about which splitting is required,
    fval: split value of the feature
    Returns: Float (entropy)'''
    old=entropy(df)
    l=df.shape[0]
    l1=df.loc[df[f]>=fval].shape[0]
    l2=df.loc[df[f]<fval].shape[0]
    new=(l1/l)*entropy(df.loc[df[f]>=fval])+(l2/l)*entropy(df.loc[df[f]<fval])
    return old-new

#### Decision Tree Class

In [19]:
class Dtree:
    ''' 
    Decision Tree Class
    depth: current depth of the node
    max_depth: maximum depth upto which tree should be grown
    '''
    def __init__(self,depth=1,max_depth=4):
        self.pred=None
        self.feature=None
        self.fval=None
        self.left=None
        self.right=None
        self.label_idx=None
        self.max_depth=max_depth
        self.depth=depth

    def train(self,df):
        '''
        Method to build the decision tree.
        Arguments:
        df: pandas DataFrame with numeric feature value, dataframe target column must be label.
        '''
        if self.label_idx is None:
            np.unique(df['label'].values.flatten())
            self.label_idx={}
            for i,k in enumerate(labels):
                self.label_idx[k]=i

        self.pred=df['label'].value_counts().idxmax()
        if self.depth==self.max_depth:
            return
        if df['label'].value_counts().shape[0]>1:
            print('Computing Information Gain!!')
            info_max=df.columns[0]
            val=infoGain(df,info_max,df[info_max].mean())
            print('(feature={},infoGain={:.2f})'.format(df.columns[0],val))
            for f in df.columns[1:-1]:
                temp_val=infoGain(df,f,df[f].mean())
                print('(feature={},infoGain={:.2f})'.format(f,temp_val))
                if temp_val>val:
                    val=infoGain(df,f,df[f].mean())
                    info_max=f
            self.feature=info_max
            self.fval=df[self.feature].mean()
            df_left=df.loc[df[self.feature]<self.fval].reset_index(drop=True)
            df_right=df.loc[df[self.feature]>=self.fval].reset_index(drop=True)
            print("\nsplit_feature={}, left={} and right={}".format(self.feature,df_left.shape[0],df_right.shape[0]),end='\n\n')
            self.left=Dtree(self.depth+1,self.max_depth)
            self.left.label_idx=self.label_idx
            self.left.train(df_left)
            self.right=Dtree(self.depth+1,self.max_depth)
            self.right.label_idx=self.label_idx
            self.right.train(df_right)
    
    def predict(self,node,series,curr_depth):
        if curr_depth==node.max_depth:
            return node.pred
        if node.left is None:
            return node.pred
        if series[node.feature]<node.fval:
            return node.predict(node.left,series,curr_depth+1)
        if series[node.feature]>=node.fval:
            return node.predict(node.right,series,curr_depth+1)

    def predict_df(self,df,target=None):
        '''
        Method to predict labels of a given dataframe
        Arguments:
        df: pandas DataFrame with numeric feature value.
        target(optional): name of target column
        '''
        label_idx=self.label_idx
        y_=[]
        y_pred=[]
        for i in range(df.shape[0]):
            val=self.predict(self,df.loc[i],1)
            y_pred.append(val)
            y_.append(label_idx[self.predict(self,df.loc[i],1)])
        y_=np.array(y_)
        result={'prediction':y_pred}
        if target is not None:
            y_true=df.label.apply(lambda x:label_idx[x]).values.flatten()
            accuracy=((y_true==y_).sum()/len(y_true))*100
            result['accuracy']=accuracy
            result['metrics']=metrics.classification_report(y_true,y_)
        return result

### training and testing

In [20]:
dtree=Dtree()
dtree.train(df_train)

Computing Information Gain!!
(feature=SepalLengthCm,infoGain=0.48)
(feature=SepalWidthCm,infoGain=0.29)
(feature=PetalLengthCm,infoGain=0.77)
(feature=PetalWidthCm,infoGain=0.72)

split_feature=PetalLengthCm, left=48 and right=72

Computing Information Gain!!
(feature=SepalLengthCm,infoGain=0.00)
(feature=SepalWidthCm,infoGain=0.17)
(feature=PetalLengthCm,infoGain=0.41)
(feature=PetalWidthCm,infoGain=0.24)

split_feature=PetalLengthCm, left=40 and right=8

Computing Information Gain!!
(feature=SepalLengthCm,infoGain=0.12)
(feature=SepalWidthCm,infoGain=0.47)
(feature=PetalLengthCm,infoGain=0.81)
(feature=PetalWidthCm,infoGain=0.81)

split_feature=PetalLengthCm, left=2 and right=6

Computing Information Gain!!
(feature=SepalLengthCm,infoGain=0.08)
(feature=SepalWidthCm,infoGain=0.02)
(feature=PetalLengthCm,infoGain=0.57)
(feature=PetalWidthCm,infoGain=0.65)

split_feature=PetalWidthCm, left=38 and right=34

Computing Information Gain!!
(feature=SepalLengthCm,infoGain=0.00)
(feature=Sepa

### Results

In [25]:
result=dtree.predict_df(df_test,target='label')

In [28]:
print('accuracy={:.2f}'.format(result['accuracy']),result['metrics'],sep='\n')

accuracy=96.67
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         8
           1       1.00      0.89      0.94         9
           2       0.93      1.00      0.96        13

    accuracy                           0.97        30
   macro avg       0.98      0.96      0.97        30
weighted avg       0.97      0.97      0.97        30



#### Confusion Matrix

In [29]:
def createCFM(y_test, predictions):
    col=np.unique(y_test)
    mat=metrics.confusion_matrix(y_test, predictions)
    data={col[n]:mat.T[n] for n in range(len(col))}
    df=pd.DataFrame(data)
    df.index=[i+'-Real' for i in col]
    return df

In [30]:
createCFM(df_test['label'], result['prediction'])

Unnamed: 0,Iris-setosa,Iris-versicolor,Iris-virginica
Iris-setosa-Real,8,0,0
Iris-versicolor-Real,0,8,1
Iris-virginica-Real,0,0,13
