In [25]:
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
%matplotlib inline

In [113]:
# Preparing the data

data = load_breast_cancer()

In [114]:
X,y=data.data,data.target

In [115]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Decision Tree


In [99]:
class Node_new:
    def __init__(self,best_col=None,best_val=None,left_child=None,right_child=None,*,value=None):
        self.best_col=best_col
        self.best_val=best_val
        self.left_child=left_child
        self.right_child=right_child
        self.value=value
    def is_leaf_node(self):
        if self.value!=None:
            return True

class Decision_Tree:
    def __init__(self,max_depth=5,min_sample=2):
        self.max_depth=max_depth
        self.min_sample=min_sample
        self.root=None
    def fit(self,X,y):
        self.root=self.grow_tree(X,y)
    def grow_tree(self,X,y,depth=0):
        uniq_lable=np.unique(y)
        if (len(uniq_lable)==1) or (depth==self.max_depth) or (X.shape[0]<self.min_sample):
            leaf_value=self.most_common_lable(y)
            return Node_new(value=leaf_value)
        else:
        
            best_col,best_val=self.get_best_feature(X,y)
            #print(best_col,best_val)
            left_indx,right_indx=self.split_data(X[:,best_col],best_val)
            
            left_child=self.grow_tree(X[left_indx],y[left_indx],depth+1)
            
            right_child=self.grow_tree(X[right_indx],y[right_indx],depth+1) 
            
            return Node_new(best_col,best_val,left_child,right_child)
    def predict(self,X_test):
        self.prediction=np.array([self.help_predict(x,self.root) for x in X_test])
        return self.prediction
    def help_predict(self,x,current_node):
        
        if current_node.is_leaf_node():
            return current_node.value
        
        if x[current_node.best_col]<=current_node.best_val:
            return self.help_predict(x,current_node.left_child)
        
        elif x[current_node.best_col]>current_node.best_val:
            return self.help_predict(x,current_node.right_child)
        
    def get_best_feature(self,X,y):
        n_samples,n_col=X.shape
        gain=-1
        for col in range(n_col):
            column=X[:,col]
            uniq_val_in_col=np.unique(column)
            for val in uniq_val_in_col:
                cur_gain=self.information_gain(y,column,val)
                if cur_gain>gain:
                    gain=cur_gain
                    best_col=col
                    best_val=val
        return best_col,best_val
                
    def information_gain(self,y,column,val):
        S_entropy=self.cal_entropy(y)
        indx_left_child,indx_right_child=self.split_data(column,val)
        left_child,right_child=y[indx_left_child],y[indx_right_child]
        n_left,n_right=len(left_child),len(right_child)
        total=n_left+n_right
        child_entropy=(n_left/total)*self.cal_entropy(left_child)+(n_right/total)*self.cal_entropy(right_child)
        ig=S_entropy-child_entropy
        return ig
        
    def cal_entropy(self,y):
        uniq_lable,count=np.unique(y,return_counts=True)
        probabilities=count/count.sum()
        entropy=sum(probabilities*(-np.log2(probabilities)))
#         entropy = sum(- probabilities*np.log2(probabilities)-probabilities*np.log2(probabilities))
        return entropy
        
    def split_data(self,X_column,thresh):
        indx_left_child=np.argwhere(X_column<=thresh).flatten()
        indx_right_child=np.argwhere(X_column>thresh).flatten()
        return indx_left_child,indx_right_child
    
    
    def most_common_lable(self,y):
        lable,count=np.unique(y,return_counts=True)
        lable_index=np.argmax(count)
        result=lable[lable_index]
        return result
    def accuracy(self,y_test):
        
        acc=sum(self.prediction==y_test)/len(y_test)
        return acc
decision_tree=Decision_Tree()

In [100]:
# Training the model

decision_tree.fit(X_train,y_train)

In [101]:
# Making predictions

pred=decision_tree.predict(X_test)

In [102]:
# Accuracy

sum(pred==y_test)/len(y_test)

0.9298245614035088

# Comparing with Sklearn model

In [103]:
from sklearn.tree import DecisionTreeClassifier

In [104]:
classifer=DecisionTreeClassifier(criterion='entropy',splitter='best',max_depth=5,min_samples_split=2)

In [105]:
classifer.fit(X_train,y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=5)

In [106]:
tree_predited=classifer.predict(X_test)

In [107]:
sum(tree_predited==y_test)/len(y_test)

0.9122807017543859