In [2]:
import numpy as np
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
col_names = [
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
    'type',
]
# Skip the first row of the file and label the columns with col_names instead.
data = pd.read_csv("iris.csv", names=col_names, header=None, skiprows=1)
data.head(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,type
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa
7,5.0,3.4,1.5,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa


## Node class

In [None]:
class Node:
    """A single node of the decision tree.

    Decision (internal) nodes carry the split description and their two
    child subtrees; leaf nodes carry only ``value``. Every field defaults to
    ``None`` so either kind of node can be built with keyword arguments.
    """

    def __init__(self, feature_index=None, threshold=None, left=None, right=None, info_gain=None, value=None):
        # leaf payload; left as None on decision nodes
        self.value = value

        # split description for decision nodes
        self.feature_index, self.threshold = feature_index, threshold
        self.info_gain = info_gain

        # child subtrees for decision nodes
        self.left, self.right = left, right

class DecisionTreeClassifier:
    """Binary decision tree classifier grown with Gini-based information gain.

    At every node the (feature, threshold) pair with the highest information
    gain is chosen; rows with ``feature <= threshold`` go to the left child
    and rows with ``feature > threshold`` go to the right child.

    Parameters
    ----------
    max_depth : int
        Deepest level at which a split is still attempted. The root is at
        depth 0 and the check is ``cur_depth <= max_depth``.
    min_samples_split : int
        Minimum number of samples a node must hold before a split is
        attempted.
    """

    def __init__(self, max_depth = 2, min_samples_split = 2):
        # root of the fitted tree; stays None until fit() is called
        self.root = None

        # stopping conditions
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split

    def build_tree(self, dataset, cur_depth=0):
        """Recursively build the (sub)tree for ``dataset`` and return its root Node.

        ``dataset`` is a 2-D array whose last column holds the labels and
        whose other columns hold the features. A decision node is returned
        when a split with positive information gain exists and the stopping
        conditions allow it; otherwise a leaf node is returned.
        """
        X, Y = dataset[:, :-1], dataset[:, -1]
        num_samples, num_features = np.shape(X)

        # split until stopping conditions are met
        if cur_depth <= self.max_depth and num_samples >= self.min_samples_split:
            best_split = self.get_best_split(dataset, num_samples, num_features)
            # best_split is an empty dict when no threshold yields two
            # non-empty children, hence .get() instead of indexing
            if best_split.get("info_gain", 0) > 0:
                left_subtree = self.build_tree(best_split["dataset_left"], cur_depth + 1)
                right_subtree = self.build_tree(best_split["dataset_right"], cur_depth + 1)
                return Node(
                    best_split["feature_index"],
                    best_split["threshold"],
                    left_subtree,
                    right_subtree,
                    best_split["info_gain"],
                )

        # Every node that was not split becomes a leaf. This must sit outside
        # the `if` above so that nodes hitting a stopping condition still
        # produce a leaf instead of returning None.
        leaf_value = self.calculate_leaf_value(Y)
        return Node(value=leaf_value)

    def get_best_split(self, dataset, num_samples, num_features):
        """Find the split of ``dataset`` with the highest Gini information gain.

        Every unique value of every feature column is tried as a threshold.
        Returns a dict with keys ``feature_index``, ``threshold``,
        ``dataset_left``, ``dataset_right`` and ``info_gain``, or an empty
        dict when no threshold produces two non-empty children.
        ``num_samples`` is accepted for interface compatibility and not used.
        """
        best_split = {}
        max_info_gain = -float("inf")
        y = dataset[:, -1]  # parent labels are the same for every candidate split

        for feature_index in range(num_features):
            possible_thresholds = np.unique(dataset[:, feature_index])

            for threshold in possible_thresholds:
                dataset_left, dataset_right = self.split(dataset, feature_index, threshold)
                # a split that leaves one side empty separates nothing
                if len(dataset_left) > 0 and len(dataset_right) > 0:
                    left_y, right_y = dataset_left[:, -1], dataset_right[:, -1]
                    cur_info_gain = self.information_gain(y, left_y, right_y, "gini")
                    if cur_info_gain > max_info_gain:
                        max_info_gain = cur_info_gain
                        best_split = {
                            "feature_index": feature_index,
                            "threshold": threshold,
                            "dataset_left": dataset_left,
                            "dataset_right": dataset_right,
                            "info_gain": max_info_gain,
                        }

        return best_split

    def split(self, dataset, feature_index, threshold):
        """Partition the rows of ``dataset`` on one feature.

        Returns ``(dataset_left, dataset_right)`` where the left part holds
        the rows with ``feature <= threshold`` and the right part the rows
        with ``feature > threshold``.
        """
        column = dataset[:, feature_index]
        # Two explicit masks (rather than one mask and its negation) so a row
        # that satisfies neither comparison lands in neither part.
        left_mask = np.asarray(column <= threshold, dtype=bool)
        right_mask = np.asarray(column > threshold, dtype=bool)
        return dataset[left_mask], dataset[right_mask]

    def information_gain(self, parent, l_child, r_child, mode = "entropy"):
        """Impurity of ``parent`` minus the size-weighted impurity of its children.

        ``mode`` selects the impurity measure: ``"gini"`` uses gini_index,
        anything else uses entropy.
        """
        weight_l = len(l_child) / len(parent)
        weight_r = len(r_child) / len(parent)

        impurity = self.gini_index if mode == "gini" else self.entropy
        return impurity(parent) - weight_l * impurity(l_child) - weight_r * impurity(r_child)

    def entropy(self, y):
        """Entropy (base 2) of the label array ``y``."""
        _, counts = np.unique(y, return_counts=True)
        probs = counts / len(y)
        return -np.sum(probs * np.log2(probs))

    def gini_index(self, y):
        """Gini impurity of the label array ``y``: 1 - sum(p_cls ** 2)."""
        _, counts = np.unique(y, return_counts=True)
        probs = counts / len(y)
        return 1 - np.sum(probs ** 2)

    def calculate_leaf_value(self, y):
        """Return the most frequent label in ``y`` (first one seen wins a tie)."""
        y = list(y)
        return max(y, key = y.count)

    def print_tree(self, tree=None, indent = " "):
        """Placeholder for a tree printer; currently does nothing."""
        pass

    def fit(self, X, Y):
        """Train the tree on features ``X`` and labels ``Y``.

        ``X`` is a 2-D array of features. ``Y`` may be a column vector of
        shape (n, 1) or a 1-D array of length n; a 1-D array is reshaped to a
        column so it can be appended to ``X``.
        """
        X = np.asarray(X)
        Y = np.asarray(Y)
        if Y.ndim == 1:
            Y = Y.reshape(-1, 1)

        dataset = np.concatenate((X, Y), axis = 1)
        self.root = self.build_tree(dataset)

    def predict(self, X):
        """Return a list with the predicted label for every row of ``X``."""
        return [self.make_prediction(x, self.root) for x in X]

    def make_prediction(self, x, tree):
        """Predict the label of one sample ``x`` by walking down from ``tree``."""
        # `is not None` rather than `!= None`: identity is the intended check
        # and it cannot be affected by how the stored label implements `!=`
        if tree.value is not None:
            return tree.value

        feature_val = x[tree.feature_index]
        if feature_val <= tree.threshold:
            return self.make_prediction(x, tree.left)
        return self.make_prediction(x, tree.right)
            

# Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Features: every column except the last. Labels: the last column, reshaped
# into an (n, 1) column vector.
X = data.iloc[:, :-1].values
Y = data.iloc[:, -1].values.reshape(-1, 1)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=.2, random_state=42
)


# Fit the model

In [None]:
# Build the tree on the training split with the chosen stopping conditions.
classifier = DecisionTreeClassifier(
    max_depth=3,
    min_samples_split=3,
)
classifier.fit(X_train, Y_train)

# Test the model

In [None]:
from sklearn.metrics import accuracy_score

# Predict labels for the held-out rows and score them against the true labels.
Y_pred = classifier.predict(X_test)
accuracy_score(Y_test, Y_pred)