In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Location of the Iris CSV. Set the IRIS_CSV environment variable to point at
# your own copy of the file; when it is unset, the original hard-coded path is
# used, so existing runs keep working unchanged.
IRIS_CSV_PATH = os.environ.get(
    "IRIS_CSV",
    r"C:\Users\Mohammed Umair\OneDrive\Desktop\DS-ML\Datasets And Notebooks\Iris.csv",
)
df = pd.read_csv(IRIS_CSV_PATH)

In [3]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
# Encoder instance; it is fitted on the Species column in the next cell.
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()

In [5]:
# Overwrite the string Species column with the integer codes returned by the
# encoder (the preview further down shows Iris-setosa encoded as 0).
df["Species"] = lb.fit_transform(df["Species"])

In [6]:
# Remove the Id column so it is not used as a feature (in the preview it is
# just a running row number). errors="ignore" keeps this cell re-runnable:
# without it, executing the cell a second time raises KeyError because "Id"
# has already been dropped.
df = df.drop(columns=["Id"], errors="ignore")

In [7]:
# Preview the first five rows after encoding Species and dropping Id.
df.head(n=5)

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


## Node Class

In [8]:
class Node():
    '''One node of the decision tree.

    A decision node stores the split it performs (feature_index, threshold),
    its two children (left, right) and the info_gain of that split.
    A leaf node only stores value; every other attribute stays None.
    '''

    def __init__(self, featrue_index=None, threshold=None, left=None,
                 right=None, info_gain=None, value=None):
        # The first parameter keeps its original (misspelt) name because it is
        # part of the public signature.

        # payload of a leaf node
        self.value = value

        # split performed by a decision node
        self.feature_index, self.threshold = featrue_index, threshold
        self.info_gain = info_gain

        # children of a decision node
        self.left, self.right = left, right

## Tree Class 

In [9]:
class DecisionTreeClassifier():
    '''Binary decision tree classifier built by greedy, recursive splitting.

    Every decision node tests ``feature <= threshold``; the split is chosen to
    maximise the Gini-based information gain. The tree is built from ``Node``
    objects and stored in ``self.root`` once ``fit`` has been called.
    '''

    def __init__(self,min_samples_split = 2, max_depth = 2):
        '''Constructor

        min_samples_split: smallest number of samples a node must hold to be
            considered for splitting.
        max_depth: maximum depth of the tree (the root is at depth 0), i.e.
            the largest number of decisions on any root-to-leaf path.
        '''

        # initialize the root of the tree
        self.root = None

        # stopping conditions of our decision tree
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth

    def build_tree(self,dataset,curr_depth = 0):
        '''recursive function to build the tree

        dataset: 2-D array whose last column holds the labels and whose other
            columns hold the features.
        curr_depth: depth of the node being built (0 for the root).
        Returns the Node at the root of the (sub)tree built from dataset.
        '''

        x,y = dataset[:,:-1],dataset[:,-1]
        num_samples, num_features = np.shape(x)

        # split until stopping conditions are met.
        # Strict "<": a node that already sits at depth max_depth must become
        # a leaf, otherwise the tree ends up max_depth + 1 levels deep.
        if num_samples >= self.min_samples_split and curr_depth < self.max_depth:
            # find the best split
            best_split = self.get_best_split(dataset, num_samples, num_features)
            # check if information gain is positive; get_best_split returns an
            # empty dict when no threshold separates the samples, which is
            # treated as "no gain" instead of raising KeyError
            if best_split.get("info_gain", 0) > 0:
                # recur left
                left_subtree = self.build_tree(best_split["dataset_left"], curr_depth+1)
                # recur right
                right_subtree = self.build_tree(best_split["dataset_right"], curr_depth+1)
                # return decision node
                return Node(best_split["feature_index"], best_split["threshold"],
                            left_subtree, right_subtree, best_split["info_gain"])

        # compute and return leaf node
        leaf_value = self.calculate_leaf_value(y)
        return Node(value = leaf_value)

    def get_best_split(self, dataset, num_samples, num_features):
        '''function to find the best split

        Tries every unique value of every feature as a threshold and keeps the
        split with the highest Gini information gain.

        Returns a dict with the keys "feature_index", "threshold",
        "dataset_left", "dataset_right" and "info_gain", or an empty dict when
        no threshold yields two non-empty children (for example when all rows
        have identical feature values).
        '''

        # dictionary to store the best split
        best_split = {}
        max_info_gain = -float("inf")

        # labels of the node being split; the same for every candidate split
        y = dataset[:,-1]

        # loop over all the features
        for feature_index in range(num_features):
            feature_values = dataset[:,feature_index]
            possible_thresholds = np.unique(feature_values)

            # loop over all the feature values present in the data
            for threshold in possible_thresholds:
                # get current split
                dataset_left, dataset_right = self.split(dataset, feature_index, threshold)
                # check if children are not empty
                if len(dataset_left)>0 and len(dataset_right)>0:
                    left_y, right_y = dataset_left[:,-1], dataset_right[:,-1]
                    # compute information gain
                    curr_info_gain = self.information_gain(y,left_y,right_y,"gini")
                    # update the best split if needed
                    if curr_info_gain > max_info_gain:
                        best_split["feature_index"] = feature_index
                        best_split["threshold"] = threshold
                        best_split["dataset_left"] = dataset_left
                        best_split["dataset_right"] = dataset_right
                        best_split["info_gain"] = curr_info_gain
                        max_info_gain = curr_info_gain
        # return best split
        return best_split

    def split(self, dataset, feature_index, threshold):
        '''function to split the data

        Returns (dataset_left, dataset_right): the rows whose value in column
        feature_index is <= threshold, and the rows whose value is > threshold.
        '''

        dataset = np.asarray(dataset)
        column = dataset[:,feature_index]
        # boolean masks select the rows in one vectorised step instead of a
        # Python-level loop over rows; two explicit comparisons (rather than
        # negating one mask) keep the original handling of NaN values, which
        # satisfy neither condition
        dataset_left = dataset[column <= threshold]
        dataset_right = dataset[column > threshold]
        return dataset_left, dataset_right

    def information_gain(self, parent, l_child, r_child, mode = "entropy"):
        '''function to compute information gain

        parent, l_child, r_child: label arrays of the node and its children.
        mode: "gini" uses Gini impurity, anything else uses entropy.
        Returns the parent impurity minus the size-weighted child impurities.
        '''

        weight_l = len(l_child)/len(parent)
        weight_r = len(r_child)/len(parent)
        if mode =="gini":
            gain = self.gini_index(parent) - (weight_l*self.gini_index(l_child)+ weight_r*self.gini_index(r_child))
        else:
            gain = self.entropy(parent) - (weight_l*self.entropy(l_child)+ weight_r*self.entropy(r_child))
        return gain

    def entropy(self,y):
        '''function to compute entropy (in bits) of the labels y'''
        class_labels, counts = np.unique(y, return_counts=True)
        # only classes that occur are counted, so every probability is > 0
        # and log2 is always defined
        probability = counts/len(y)
        entropy = -np.sum(probability* np.log2(probability))
        return entropy

    def gini_index(self,y):
        '''function to compute gini impurity of the labels y'''
        class_labels, counts = np.unique(y,return_counts=True)
        probability = counts/len(y)
        gini = 1 - np.sum(probability**2)
        return gini

    def calculate_leaf_value(self,y):
        '''function to compute leaf node: the most frequent label in y

        When several labels are equally frequent, the one that occurs first
        in y is returned.
        '''
        values, first_seen, counts = np.unique(y, return_index=True, return_counts=True)
        tied = counts == counts.max()
        # among the most frequent labels pick the earliest occurrence in y
        return values[tied][np.argmin(first_seen[tied])]

    def fit(self,x,y):
        '''function to train the tree

        x: 2-D array of features, one row per sample.
        y: labels, either as a column of shape (n, 1) or as a 1-D vector of
            length n (which is reshaped into a column).
        '''
        x = np.asarray(x)
        y = np.asarray(y)
        if y.ndim == 1:
            y = y.reshape(-1,1)
        dataset = np.concatenate((x,y),axis=1)
        self.root = self.build_tree(dataset)

    def predict(self,x):
        '''function to predict new dataset; returns a list with one label per row of x'''

        prediction = [self.make_prediction(a,self.root) for a in x]
        return prediction

    def make_prediction(self,x,tree):
        '''function to predict a single data point

        Walks from the node tree down to a leaf, going left when the tested
        feature is <= the node threshold and right otherwise.
        '''

        # "is not None" is an identity check, so it cannot be affected by how
        # the stored value implements "!="
        if tree.value is not None: return tree.value
        feature_val = x[tree.feature_index]
        if feature_val<=tree.threshold:
            return self.make_prediction(x, tree.left)
        else:
            return self.make_prediction(x, tree.right)

In [10]:
# Every column except the last one is a feature.
X = df.iloc[:, :-1].to_numpy()
# The last column is the label; selecting it with a list keeps the result
# two-dimensional, i.e. one column with one row per sample.
Y = df.iloc[:, [-1]].to_numpy()
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=41
)

In [11]:
# Train the hand-written tree on the training split.
classifier = DecisionTreeClassifier(max_depth=3, min_samples_split=3)
classifier.fit(X_train, Y_train)

In [12]:
from sklearn.metrics import accuracy_score
# Predict the held-out rows and report the fraction classified correctly.
Y_pred = classifier.predict(X_test)
accuracy_score(y_true=Y_test, y_pred=Y_pred)

0.9333333333333333