In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Loading the iris dataset
url = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv'
iris = pd.read_csv(url)

In [3]:
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [4]:
iris.tail()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica
149,5.9,3.0,5.1,1.8,virginica


In [5]:
iris.shape

(150, 5)

In [6]:
# Counts the number of null values, This is used to check if cleaning is required
iris.isnull().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [7]:
# Gives details about distribution of data
iris.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [8]:
# This is similar to describe, but here we group by species
metrics = ['count', 'min', 'max', 'mean','std','skew']
iris.groupby(by='species').agg(metrics)

Unnamed: 0_level_0,sepal_length,sepal_length,sepal_length,sepal_length,sepal_length,sepal_length,sepal_width,sepal_width,sepal_width,sepal_width,...,petal_length,petal_length,petal_length,petal_length,petal_width,petal_width,petal_width,petal_width,petal_width,petal_width
Unnamed: 0_level_1,count,min,max,mean,std,skew,count,min,max,mean,...,max,mean,std,skew,count,min,max,mean,std,skew
species,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
setosa,50,4.3,5.8,5.006,0.35249,0.120087,50,2.3,4.4,3.428,...,1.9,1.462,0.173664,0.106394,50,0.1,0.6,0.246,0.105386,1.253861
versicolor,50,4.9,7.0,5.936,0.516171,0.105378,50,2.0,3.4,2.77,...,5.1,4.26,0.469911,-0.606508,50,1.0,1.8,1.326,0.197753,-0.03118
virginica,50,4.9,7.9,6.588,0.63588,0.118015,50,2.2,3.8,2.974,...,6.9,5.552,0.551895,0.549445,50,1.4,2.5,2.026,0.27465,-0.129477


In [9]:
# Assigning class labels to the data
labels = iris["species"]
labels = labels.replace({"setosa": 0, "versicolor": 1, "virginica": 2})
print(labels.head())

0    0
1    0
2    0
3    0
4    0
Name: species, dtype: int64


In [10]:
# Seperating the labels from the features
iris = iris.drop(["species"], axis=1)

In [25]:
def BuildTree(start, end, k = 3):
    if (end - start) < k:
        count = [0, 0, 0]
        for i in range(start, end):
            count[labels[i]] += 1

        # We assign the label with maximum count as a leaf node
        max_idx = count.index(max(count))
        print("Leaf node: ", max_idx)
        return
    else:
        split_idx = 0
        split_val = 0
        min_purity = 1
        # For each feature, we try to find the best split index
        for i in range(iris.shape[1]):
            temp_split_idx = i
            col = iris.columns[i]
            unique = iris[col].unique()
            iris.sort_values(by=[col], inplace=True)
            # For each data instance, we try to find the best split value
            for j in range(1, unique.shape[0]):
                # We consider the midpoint of two consecutive data instances after sorting
                temp_split_val = (unique[j] + unique[j-1])/2
                left = [0, 0, 0]
                right = [0, 0, 0]
                # Seperate all the data values in two halves as per their value
                for k in range(iris.shape[0]):
                    if iris.iloc[k, i] < temp_split_val:
                        left[labels[k]] += 1
                    else:
                        right[labels[k]] += 1
                left_sum = sum(left)
                right_sum = sum(right)
                print(left[0], left[1], left[2], right[0], right[1], right[2])
                print(left_sum, right_sum)
                gini = 1 - ((left_sum / iris.shape[0]) * ((left[0] / left_sum) ** 2 + (left[1] / left_sum) ** 2 + (left[2] / left_sum) ** 2)) - ((right_sum / iris.shape[0]) * ((right[0] / right_sum) ** 2 + (right[1] / right_sum) ** 2 + (right[2] / right_sum) ** 2))


                if (gini < min_purity):
                    min_purity = gini
                    split_idx = temp_split_idx
                    split_val = temp_split_val

        print("Split index: ", split_idx)
        print("Split value: ", split_val)
        print("Gini: ", min_purity)



In [26]:
BuildTree(0, iris.shape[0], 2)

5 0 0 45 50 50
5 145
5 0 0 45 50 50
5 145
32 0 0 18 50 50
32 118
32 0 0 18 50 50
32 118
11 0 0 39 50 50
11 139
16 0 0 34 50 50
16 134
45 0 0 5 50 50
45 105
32 0 0 18 50 50
32 118
32 0 0 18 50 50
32 118
46 0 0 4 50 50
46 104
50 9 0 0 41 50
59 91
32 0 0 18 50 50
32 118
32 0 0 18 50 50
32 118
32 0 0 18 50 50
32 118
45 0 0 5 50 50
45 105
50 23 0 0 27 50
73 77
50 30 0 0 20 50
80 70
50 50 8 0 0 42
108 42
50 50 15 0 0 35
115 35
50 49 0 0 1 50
99 51
50 50 8 0 0 42
108 42
50 50 22 0 0 28
122 28
50 50 33 0 0 17
133 17
50 50 30 0 0 20
130 20
50 50 33 0 0 17
133 17
50 50 8 0 0 42
108 42
50 45 0 0 5 50
95 55
50 50 33 0 0 17
133 17
50 50 42 0 0 8
142 8
50 50 43 0 0 7
143 7
50 50 45 0 0 5
145 5
50 50 49 0 0 1
149 1
50 50 45 0 0 5
145 5
50 50 43 0 0 7
143 7
50 33 0 0 17 50
83 67
50 33 0 0 17 50
83 67
19 0 0 31 50 50
19 131
47 0 0 3 50 50
47 103
50 50 7 0 0 43
107 43
50 50 13 0 0 37
113 37
50 33 0 0 17 50
83 67
11 0 0 39 50 50
11 139
50 7 0 0 43 50
57 93
33 0 0 17 50 50
33 117
24 0 0 26 50 50
24 126
50