In [52]:
import numpy as np
import os
from numpy.random import choice
from collections import Counter

In [53]:
######## NODE CLASS #######
class Node:
    def __init__(self,left=None,right=None,threshold=None,feature=None,value=None):
        self.left      = left
        self.right     = right
        self.threshold = threshold
        self.feature   = feature
        self.value     = value
    def is_leaf(self): #checks if it is a leaf node
        if self.value == None:
            return False
        else:
            return True

In [54]:
####### DECISION CLASS #########
class Decision_tree:
    def __init__(self,max_depth=100,min_sample_split=2,nfeat=None):
        self.max_depth        = max_depth
        self.min_sample_split = min_sample_split
        self.nfeat            = nfeat
        self.root             = None
    def fit(self,X,y):
        self.root = self.grow_tree(X,y)
    def grow_tree(self,X,y,depth=0):
        #MAKE A PREDICTION IF CONDITIONS ARE MET
        nsamples,nfeatures = np.shape(X)
        if (nsamples == self.min_sample_split) or (depth >= self.max_depth) or (len(np.unique(y)) > 1):
            value = self.most_frequent_value(y)
            return Node(value=value)
        #SELECT THE FEATURE SUBSET TO BE CONSIDERED
        if (self.nfeat is None):
            feat_subset = np.arange(nfeatures) #getting all features
        else:
            feat_subset = choice(nfeatures,self.nfeat,replace=False) #gettin a subset
        #EVALUATE IG FOR EACH CONFIG
        best_IG      = -1
        best_thresh  = None
        best_feature = None
        for feat in feat_subset:
            X_column   = X[:,feat]
            thresholds = np.unique(X_column)
            for thresh in thresholds:
                left_indx, right_indx = self.split_by_thresh(X_column,thresh)
                IG  = self.information_gain(y,left_indx,right_indx)
                if IG > best_IG:
                    best_IG      = IG
                    best_thresh  = thresh
                    best_feature = feature
        #EXECUTE THE SPLIT
        left_indx, right_indx = self.split_by_thresh(X_column,best_thresh)
        #GROW_TREE LEFT AND RIGHT
        child_l = self.grow_tree(X[left_indx,:] ,y[left_indx,:] ,depth=depth+1)
        child_r = self.grow_tree(X[right_indx,:],y[right_indx,:],depth=depth+1)
        #RETURN THE NODE(LEFT,RIGHT)
        return Node(left=child_l,right=child_r)
    def most_frequent_value(self,y):
        return Counter(y).most_common()[0][0]
    def split_by_thresh(X_column,thresh):
        left_side  = np.argwhere(X_column <= thresh).flatten() #data that do not conforms with the threshold
        right_side = np.argwhere(X_column > thresh).flatten()
        return left_side,right_side
    def information_gain(self,y,left_indx,right_indx):
        #calc parent's entropy
        par_en = self.entropy(y)
        #calculate childs' entropies
        y_l = y[left_indx]
        y_r = y[right_indx]
        en_l = self.entropy(y_l)
        en_r = self.entropy(y_r)
        #calculate IG
        IG  = par_en - (len(y_l)/len(y))*en_l - (len(y_r)/len(y))*en_r
        #return IG
        print(IG)
        return IG
    def entropy(self,y):
        pp = np.bincount(y)/len(y)
        return sum([-p*np.log(p) for p in pp])
    def predict(self,X):
        return [self.check_tree_val(x,self.root) for x in X]
    def check_tree_val(self,x,node):
        if node.is_leaf():
            return self.root.value
        if x[node.feature] <= node.threshold:
            return check_tree_val(self,x,node.left)
        else:
            return check_tree_val(self,x,node.right)

In [57]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
import numpy as np

data = datasets.load_breast_cancer()
X, y = data.data, data.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1234
)

clf = Decision_tree(max_depth=100)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)

def accuracy(y_test, y_pred):
    return np.sum(y_test == y_pred) / len(y_test)

acc = accuracy(y_test, predictions)
print(acc)
print(predictions)

0.6052631578947368
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
