In [1]:
import numpy as np
from sklearn.metrics import accuracy_score
import random
class Node:
    def __init__(self, predicted_class):
        self.predicted_class = predicted_class
        self.feature_index = 0
        self.threshold = 0
        self.left = None
        self.right = None


class DecisionTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def split_data(self,X,y):
        idx = np.arange(0,y.shape[0])
        percent_train = .6
        random.shuffle(idx)
        idx_train = idx[0:int(percent_train*len(X))]
        idx_test = idx[len(idx_train):len(idx)]

        X_train = X[idx_train]
        X_test = X[idx_test]
        y_train = y[idx_train]
        y_test = y[idx_test]
        return idx, X_train, y_train, X_test, y_test

    def fit(self, X, y):
        self.n_classes_ = len(set(y))
        self.n_features_ = X.shape[1]
        self.tree_ = self._grow_tree(X, y)

    def predict(self, X):
        return [self._predict(inputs) for inputs in X]

    def _best_split(self, X, y):
        m = y.size
        if m <= 1:
            return None, None
        num_parent = [np.sum(y == c) for c in range(self.n_classes_)] #[50,50,50] # Count of each class in the current node.
        best_gini = 1.0 - sum((n / m) ** 2 for n in num_parent)# Gini of parent node.
        best_idx, best_thr = None, None
        # Loop through all features.
        for idx in range(self.n_features_):
            # Sort data along selected feature.
            thresholds, classes = zip(*sorted(zip(X[:, idx], y))) #thresholds.. #[2, 3, 10, 19]
            num_left = [0] * self.n_classes_ #[0,0,0]
            num_right = num_parent.copy() #[50,50,50]
            for i in range(1, m):
                c = classes[i - 1]                
                num_left[c] += 1                
                num_right[c] -= 1
                
                gini_left = 1.0 - sum(
                    (num_left[x] / i) ** 2 for x in range(self.n_classes_)
                )
                #we divided by n_samples - i since we know that the left amount of samples
                #since left side has already i samples
                gini_right = 1.0 - sum(
                    (num_right[x] / (m - i)) ** 2 for x in range(self.n_classes_)
                )
                #weighted gini 
                gini = (i * gini_left + (m - i) * gini_right) / m
                
                if thresholds[i] == thresholds[i - 1]:
                    continue
                if gini < best_gini:
                    best_gini = gini
                    best_idx = idx  #feature_ix
                    best_thr = (thresholds[i] + thresholds[i - 1]) / 2 #sample_sorted[i]
        return best_idx, best_thr #best_indexfeature and thershold

    def _grow_tree(self, X, y, depth=0):
        num_samples_per_class = [np.sum(y == i) for i in range(self.n_classes_)]# each y value of each element [50,50,50]
        predicted_class = np.argmax(num_samples_per_class) # return index that get max values
        node = Node(predicted_class=predicted_class)

        if depth < self.max_depth:
            idx, thr = self._best_split(X, y)
            if idx is not None:
                indices_left = X[:, idx] < thr                
                X_left, y_left = X[indices_left], y[indices_left]                
                X_right, y_right = X[~indices_left], y[~indices_left]
                
                
                #take note for later decision
                node.feature_index = idx
                node.threshold = thr
                node.left = self._grow_tree(X_left, y_left, depth + 1)
                node.right = self._grow_tree(X_right, y_right, depth + 1)
                
        return node

    def _predict(self, inputs):
        node = self.tree_
        while node.left:
            if inputs[node.feature_index] < node.threshold:
                node = node.left
            else:
                node = node.right
        #choose left or right node from input
        
        return node.predicted_class

    def accuracy(self,y,ypred):
      acc=accuracy_score(y, ypred) #list
      return acc

if __name__ == "__main__":
    import sys
    from sklearn.datasets import load_iris

    dataset = load_iris()
    X, y = dataset.data, dataset.target
    clf = DecisionTree(max_depth=10)
    clf.fit(X, y)

    #pridict from input example
    print(clf.predict([[0, 0, 5, 1.5]]))

    #pridict from 1 example
    pred = clf.predict([X[0]])
    acc=clf.accuracy([y[0]],[pred])
    print(pred,acc)

    #pridict data split train test
    idx, X_train, y_train, X_test, y_test = clf.split_data(X, y)
    pred = clf.predict(X_test)
    acc=clf.accuracy(y_test,pred)
    print(pred,acc)

[2]
[0] 1.0
[0, 0, 2, 0, 2, 0, 1, 0, 2, 0, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 0, 1, 2, 2, 1, 1, 2, 2, 2, 0, 2, 2, 0, 1, 2, 1, 1, 2, 1, 2, 2, 0, 1, 0, 2, 0, 1, 2, 2, 1, 1, 0, 2, 1, 1, 0, 0, 0, 2, 2] 1.0
