In [1]:
from sklearn import datasets
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
iris = datasets.load_iris()

In [3]:
df = pd.DataFrame(iris.data)
df.columns = ["sl", "sw", 'pl', 'pw']

In [4]:
#Function to find label for a value
#if MIN_Value <=val < (m + Mean_Value) / 2 then it is assigned label a
#if (m + Mean_Value) <=val < Mean_Value then it is assigned label b
#if (Mean_Value) <=val < (Mean_Value + MAX_Value)/2 then it is assigned label c
#if (Mean_Value + MAX_Value)/2 <=val <= MAX_Value  then it is assigned label d

def label(val, *boundaries):
    if (val < boundaries[0]):
        return 'a'
    elif (val < boundaries[1]):
        return 'b'
    elif (val < boundaries[2]):
        return 'c'
    else:
        return 'd'

#Function to convert a continuous data into labelled data
#There are 4 lables  - a, b, c, d
def toLabel(df, old_feature_name):
    second = df[old_feature_name].mean()
    minimum = df[old_feature_name].min()
    first = (minimum + second)/2
    maximum = df[old_feature_name].max()
    third = (maximum + second)/2
    return df[old_feature_name].apply(label, args= (first, second, third))

In [5]:
#Convert all columns to labelled data
df['sl_labeled'] = toLabel(df, 'sl')
df['sw_labeled'] = toLabel(df, 'sw')
df['pl_labeled'] = toLabel(df, 'pl')
df['pw_labeled'] = toLabel(df, 'pw') 
df

Unnamed: 0,sl,sw,pl,pw,sl_labeled,sw_labeled,pl_labeled,pw_labeled
0,5.1,3.5,1.4,0.2,b,c,a,a
1,4.9,3.0,1.4,0.2,a,b,a,a
2,4.7,3.2,1.3,0.2,a,c,a,a
3,4.6,3.1,1.5,0.2,a,c,a,a
4,5.0,3.6,1.4,0.2,a,c,a,a
5,5.4,3.9,1.7,0.4,b,d,a,a
6,4.6,3.4,1.4,0.3,a,c,a,a
7,5.0,3.4,1.5,0.2,a,c,a,a
8,4.4,2.9,1.4,0.2,a,b,a,a
9,4.9,3.1,1.5,0.1,a,c,a,a


In [6]:
df.drop(['sl', 'sw', 'pl', 'pw'], axis = 1, inplace = True)

In [7]:
set(df['sl_labeled'])

{'a', 'b', 'c', 'd'}

In [8]:
df['y']=((iris.target)).tolist()

In [9]:
# converted the dataframe to list as I work good lists

df=((df.to_numpy()).tolist())

In [10]:
x_train,x_test,y_train,y_test= train_test_split(df,iris.target,random_state = 2)

In [11]:
data=x_train

In [12]:
# Column labels.
# These are used only to print the steps and tree.
header=['sl_labeled','sw_labeled','pl_labeled','pw_labeled','y']

In [13]:
def unique_vals(rows, col):
    ## Find the unique values for a column in a dataset.
    
    return set([row[col] for row in rows])

In [14]:
def class_counts(rows):
    ## Counts the number of each type of example in a dataset.
    
    counts = {}  # a dictionary of label -> count.
    for row in rows:
        # in our dataset format, the label is always the last column
        label = row[-1]
        if label not in counts:
            counts[label] = 0
        counts[label] += 1
    return counts

In [15]:
class Question:
    """
    A Question is used to partition the dataset.
    This class just records a 'column number' and a 'column value' .
    The 'match' method is used to compare
    """

    def __init__(self, column, value):
        self.column = column
        self.value = value

    def match(self, example):
        # Compare the feature value in an example to the
        # feature value in this question.
        val = example[self.column]
        return val == self.value

    def __repr__(self):
        # This is just a helper method to print
        # the question in a readable format.
        condition = "=="
        return "Is %s %s %s?" % (
            header[self.column], condition, str(self.value))

In [16]:
def partition(rows, question):
    """Partitions the dataset based on the question.

    For each row in the dataset, check if it matches the question. If
    so, add it to 'true rows', otherwise, add it to 'false rows'.
    """
    true_rows, false_rows = [], []
    for row in rows:
        if question.match(row):
            true_rows.append(row)
        else:
            false_rows.append(row)
    return true_rows, false_rows

In [17]:
def gini(rows):
    ## Calculate the Gini Index for a list of rows.

    counts = class_counts(rows)
    index = 1
    for lbl in counts:
        prob_of_lbl = counts[lbl] / float(len(rows))
        index -= prob_of_lbl**2
    return index

In [18]:
def gini_gain(left, right, current_gini):
    ## Gain in gini index after split.
    
    p = float(len(left)) / (len(left) + len(right))
    return current_gini - p * gini(left) - (1 - p) * gini(right)

In [19]:
def find_best_split(rows):
    """Find the best question to ask by iterating over every feature / value
    and calculating the gain in gini."""
    best_gain = 0  # keep track of the best gain
    best_question = None  # keep track of the feature / value that produced it
    current_gini = gini(rows)
    n_features = len(rows[0]) - 1  # number of columns

    for col in range(n_features):  # for each feature

        values = set([row[col] for row in rows])  # unique values in the column

        for val in values:  # for each value

            question = Question(col, val)

            # try splitting the dataset
            true_rows, false_rows = partition(rows, question)

            # Skip this split if it doesn't divide the dataset.
            if len(true_rows) == 0 or len(false_rows) == 0:
                continue

            # Calculate the information gain from this split
            gain = gini_gain(true_rows, false_rows, current_gini)
            
            if gain >= best_gain:
                best_gain, best_question = gain, question

    return best_gain, best_question

In [20]:
class Leaf:
    """Leaf node - classify data.

    This holds a dictionary of class -> number of times
    it appears in the rows from the training data that reach this leaf.
    """

    def __init__(self, rows):
        d = class_counts(rows)
        D={}
        if 0 in d:
           D['setosa']=d[0]
        if 1 in d:
           D['versicolor']=d[1]
        if 2 in d:
           D['virginica']=d[2]
        ## this dictionary holds the count in a leaf node
        self.predictions=D

In [21]:
class Decision_Node:
    """Decision Node - asks a question.

    This holds a reference to the question, and to the two child nodes.
    """

    def __init__(self,
                 question,
                 true_branch,
                 false_branch,
                 gain_list,
                 rows):
        self.question = question
        self.true_branch = true_branch
        self.false_branch = false_branch
        self.gain_list = gain_list
        d = class_counts(rows)
        D={}
        if 0 in d:
           D['setosa']=d[0]
        if 1 in d:
           D['versicolor']=d[1]
        if 2 in d:
           D['virginica']=d[2]
        ## this dictionary holds the count in a non-leaf node
        self.predictions=D

In [22]:
def build_tree(rows,gain_list=[]):
    ## Builds the tree.


    # Try partitioing the dataset on each of the unique attribute,
    # calculate the gain,
    # and return the question that produces the highest gain.
    gain, question = find_best_split(rows)
    # Base case: no further gain
    # Since we can ask no further questions,
    # we'll return a leaf.
    if gain == 0:
        return Leaf(rows)
    # If we reach here, we have found a useful feature / value
    # to partition on.
    true_rows, false_rows = partition(rows, question)
    
    
    # Recursively build the true branch.
    true_branch = build_tree(true_rows,gain_list)

    # Recursively build the false branch.
    false_branch = build_tree(false_rows,gain_list)

    # Return a Question node.
    # This records the best feature / value to ask at this point,
    # as well as the branches to follow
    # dependingo on the answer.
    gain_list.append(gain)
    return Decision_Node(question, true_branch, false_branch,gain_list,rows)

In [23]:
def print_steps(node,i):
    ## tree printing function.
    
    # Base case: we've reached a leaf
    if isinstance(node, Leaf):
        print ("Count", node.predictions)
        print ("Reached leaf node")
        return
    else:
        print ("Count", node.predictions)

    # Print the question at this node
    print ('\nlevel : %d'%(i))
    print ('The best feature to split on with gain ratio %f is:'%(node.gain_list[i]))
    print ( str(node.question))
    
    i=i+1
    # Call this function recursively on the true branch
    print ('--> True:')
    print_steps(node.true_branch,i)

    # Call this function recursively on the false branch
    print ('--> False:')
    print_steps(node.false_branch,i)
    

In [24]:
def classify(row, node):

    # Base case: reached a leaf
    if isinstance(node, Leaf):
        return node.predictions

    # Decide whether to follow the true-branch or the false-branch.
    # Compare the feature / value stored in the node
    if node.question.match(row):
        return classify(row, node.true_branch)
    else:
        return classify(row, node.false_branch)

In [25]:
def print_leaf(counts):
    """A nicer way to print the predictions at a leaf,
       by converting it to percentage."""
    total = sum(counts.values()) * 1.0
    probs = {}
    for lbl in counts.keys():
        probs[lbl] = str(int(counts[lbl] / total * 100)) + "%"
    return probs

In [26]:
def print_tree(node, spacing=""):
    ## tree printing function.

    # Base case: reached a leaf
    if isinstance(node, Leaf):
        print (spacing + "Count", node.predictions)
        return
    
    else:
        print (spacing + "Count", node.predictions)
    # Print the question at this node
    print (spacing + str(node.question))

    # Call this function recursively on the true branch
    print (spacing + '--> True:')
    print_tree(node.true_branch, spacing + "  ")

    # Call this function recursively on the false branch
    print (spacing + '--> False:')
    print_tree(node.false_branch, spacing + "  ")

In [27]:
# # Testing if the functions work properly
# unique_vals(data,0)

In [28]:
# # Testing if the functions work properly

# class_counts(data)

In [29]:
# # Testing if the functions work properly

# q=Question(2,'a')
# print(q)
# true_rows, false_rows = partition(data[40:60], q)
# print('\nTrue-',true_rows,'\n\nFalse-', false_rows)

In [30]:
# # Testing if the functions work properly

# current_gini=gini(data)
# gini_gain(true_rows, false_rows, current_gini)

In [31]:
# # Testing if the functions work properly

# best_gain, best_question = find_best_split(data)
# best_gain, best_question

In [32]:
# Building tree and printing the steps:

my_tree = build_tree(data)
print_steps(my_tree,0)

Count {'setosa': 34, 'versicolor': 39, 'virginica': 39}

level : 0
The best feature to split on with gain ratio 0.000659 is:
Is pw_labeled == a?
--> True:
Count {'setosa': 34}
Reached leaf node
--> False:
Count {'versicolor': 39, 'virginica': 39}

level : 1
The best feature to split on with gain ratio 0.052288 is:
Is pw_labeled == d?
--> True:
Count {'virginica': 28}
Reached leaf node
--> False:
Count {'versicolor': 39, 'virginica': 11}

level : 2
The best feature to split on with gain ratio 0.033058 is:
Is pl_labeled == d?
--> True:
Count {'virginica': 5}
Reached leaf node
--> False:
Count {'versicolor': 39, 'virginica': 6}

level : 3
The best feature to split on with gain ratio 0.027240 is:
Is sl_labeled == b?
--> True:
Count {'versicolor': 16}
Reached leaf node
--> False:
Count {'versicolor': 23, 'virginica': 6}

level : 4
The best feature to split on with gain ratio 0.019617 is:
Is sw_labeled == c?
--> True:
Count {'versicolor': 7}
Reached leaf node
--> False:
Count {'versicolor': 

In [33]:
"""
    checking the prediction on training data
    printing actual type and prediction for checking
    and confusion matrix
"""
d={0:0,1:0,2:0}
t1=0;t2=0;t3=0;f1=0;f2=0;f3=0;
y_pred_train=[]
for row in data:
    D=print_leaf(classify(row, my_tree))
    l=list(D.keys())
    predicted_label=l[0]
    y_pred_train.append(row[-1])
    if row[-1]==0:
        label='setosa'
        if predicted_label==label:
            t1+=1
        else:
            f1+=1
    if row[-1]==1:
        label='versicolor'
        if predicted_label==label:
            t2+=1
        else:
            f2+=1
    if row[-1]==2:
        label='virginica'
        if predicted_label==label:
            t3+=1
        else:
            f3+=1
    print ("Actual: %s - Predicted: %s" %
           (label, D))


data1=np.array([[t1,f1],[t2,f2],[t3,f3]])
index=["setosa","versicolor","virginica"]
conf_mat= pd.DataFrame({'true':data1[:,0],'false':data1[:,1]},index=index)
print('\n\n')
print('confusion matrix of training data:')
conf_mat

Actual: versicolor - Predicted: {'versicolor': '100%'}
Actual: virginica - Predicted: {'virginica': '100%'}
Actual: versicolor - Predicted: {'versicolor': '71%', 'virginica': '28%'}
Actual: versicolor - Predicted: {'versicolor': '71%', 'virginica': '28%'}
Actual: virginica - Predicted: {'virginica': '100%'}
Actual: versicolor - Predicted: {'versicolor': '100%'}
Actual: setosa - Predicted: {'setosa': '100%'}
Actual: virginica - Predicted: {'virginica': '100%'}
Actual: setosa - Predicted: {'setosa': '100%'}
Actual: versicolor - Predicted: {'versicolor': '100%'}
Actual: setosa - Predicted: {'setosa': '100%'}
Actual: setosa - Predicted: {'setosa': '100%'}
Actual: setosa - Predicted: {'setosa': '100%'}
Actual: virginica - Predicted: {'virginica': '100%'}
Actual: virginica - Predicted: {'virginica': '100%'}
Actual: virginica - Predicted: {'virginica': '100%'}
Actual: setosa - Predicted: {'setosa': '100%'}
Actual: virginica - Predicted: {'virginica': '100%'}
Actual: virginica - Predicted: {'v

Unnamed: 0,true,false
setosa,34,0
versicolor,39,0
virginica,34,5


In [34]:
"""
    checking the prediction on testing data
    printing actual type and prediction for checking
    and confusion matrix
"""
d={0:0,1:0,2:0}
t1=0;t2=0;t3=0;f1=0;f2=0;f3=0;
y_pred_test=[]
for row in x_test:
    D=print_leaf(classify(row, my_tree))
    l=list(D.keys())
    predicted_label=l[0]
    y_pred_test.append( )
    if row[-1]==0:
        label='setosa'
        if predicted_label==label:
            t1+=1
        else:
            f1+=1
    if row[-1]==1:
        label='versicolor'
        if predicted_label==label:
            t2+=1
        else:
            f2+=1
    if row[-1]==2:
        label='virginica'
        if predicted_label==label:
            t3+=1
        else:
            f3+=1
    print ("Actual: %s - Predicted: %s" %
           (label, D))


data1=np.array([[t1,f1],[t2,f2],[t3,f3]])
index=["setosa","versicolor","virginica"]
conf_mat= pd.DataFrame({'true':data1[:,0],'false':data1[:,1]},index=index)
print('\n\n')
print('confusion matrix of testing data:')
conf_mat

Actual: setosa - Predicted: {'setosa': '100%'}
Actual: setosa - Predicted: {'setosa': '100%'}
Actual: virginica - Predicted: {'virginica': '100%'}
Actual: setosa - Predicted: {'setosa': '100%'}
Actual: setosa - Predicted: {'setosa': '100%'}
Actual: virginica - Predicted: {'virginica': '100%'}
Actual: setosa - Predicted: {'setosa': '100%'}
Actual: virginica - Predicted: {'virginica': '100%'}
Actual: virginica - Predicted: {'virginica': '100%'}
Actual: setosa - Predicted: {'setosa': '100%'}
Actual: setosa - Predicted: {'setosa': '100%'}
Actual: setosa - Predicted: {'setosa': '100%'}
Actual: setosa - Predicted: {'setosa': '100%'}
Actual: setosa - Predicted: {'setosa': '100%'}
Actual: versicolor - Predicted: {'versicolor': '100%'}
Actual: versicolor - Predicted: {'versicolor': '71%', 'virginica': '28%'}
Actual: setosa - Predicted: {'setosa': '100%'}
Actual: versicolor - Predicted: {'versicolor': '100%'}
Actual: virginica - Predicted: {'virginica': '100%'}
Actual: versicolor - Predicted: {'

Unnamed: 0,true,false
setosa,16,0
versicolor,11,0
virginica,9,2


In [39]:
from sklearn.metrics import classification_report, confusion_matrix
print('\ntest classification_report:')
print(classification_report(y_test,np.array(y_pred_test)))
print(confusion_matrix(y_test,np.array(y_pred_test)))
print('\ntrain classification_report:')
print(classification_report(y_train,np.array(y_pred_train)))
print(confusion_matrix(y_train,np.array(y_pred_train)))


test classification_report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      1.00      1.00        11
           2       1.00      1.00      1.00        11

   micro avg       1.00      1.00      1.00        38
   macro avg       1.00      1.00      1.00        38
weighted avg       1.00      1.00      1.00        38

[[16  0  0]
 [ 0 11  0]
 [ 0  0 11]]

train classification_report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        34
           1       1.00      1.00      1.00        39
           2       1.00      1.00      1.00        39

   micro avg       1.00      1.00      1.00       112
   macro avg       1.00      1.00      1.00       112
weighted avg       1.00      1.00      1.00       112

[[34  0  0]
 [ 0 39  0]
 [ 0  0 39]]


In [36]:
## printing the tree

print_tree(my_tree)

Count {'setosa': 34, 'versicolor': 39, 'virginica': 39}
Is pw_labeled == a?
--> True:
  Count {'setosa': 34}
--> False:
  Count {'versicolor': 39, 'virginica': 39}
  Is pw_labeled == d?
  --> True:
    Count {'virginica': 28}
  --> False:
    Count {'versicolor': 39, 'virginica': 11}
    Is pl_labeled == d?
    --> True:
      Count {'virginica': 5}
    --> False:
      Count {'versicolor': 39, 'virginica': 6}
      Is sl_labeled == b?
      --> True:
        Count {'versicolor': 16}
      --> False:
        Count {'versicolor': 23, 'virginica': 6}
        Is sw_labeled == c?
        --> True:
          Count {'versicolor': 7}
        --> False:
          Count {'versicolor': 16, 'virginica': 6}
          Is pw_labeled == b?
          --> True:
            Count {'versicolor': 4}
          --> False:
            Count {'versicolor': 12, 'virginica': 6}
            Is sl_labeled == c?
            --> True:
              Count {'versicolor': 12, 'virginica': 5}
              Is sw_labele