In [6]:
import numpy as np
from collections import Counter

class Node:
    def __init__(self,feature = None,threshold = None,left = None,right = None,*,value= None,n_value=None) -> None:
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value
        
    def is_leaf_node(self):
        return not self.value is None
    
    def __str__(self):
        if self.feature:return f'{self.feature} > {self.threshold}'
        return f'value = {self.value}'

In [7]:
class DecisionTree:
    
    def __init__(self,min_samples_split =15 ,max_depth = 4 ,n_features = 0 ,root = None) -> None:
        
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.n_features = n_features
        self.root = root
        self.features_importance = {}
    
    def __calulate_entropy(self,y):
        y = y.to_numpy().flatten()
        hist = np.bincount(y)
        ps = hist/len(y)
        return -np.sum([p * np.log2(p) for p in ps if p > 0])
    
    def __split_data(self,X,threshold):
        
        left_idxs = X[X<=threshold].index
        right_idxs = X[X>threshold].index
        
        return left_idxs,right_idxs
        
    def __information_gain(self,X,threshold,y):

        parent_entropy = self.__calulate_entropy(y)

        left_idxs,right_idxs = self.__split_data(X,threshold)
        
        if len(left_idxs) == 0 or len(right_idxs) == 0:
            return 0
        
        n = len(y)
        n_l , n_r =  len(left_idxs), len(right_idxs)
        e_l , e_r = self.__calulate_entropy(y.loc[list(left_idxs)]), self.__calulate_entropy(y.loc[list(right_idxs)])
        
        child_entropy = (n_l/n)*e_l + (n_r/n)*e_r
        
        
        information_gain = parent_entropy - child_entropy
        
        return information_gain
    
    
    def __update_features_importance(self,best_feature,best_threshold,depth):
        
        if not self.features_importance.get(depth):
            self.features_importance[depth] = [[best_feature,best_threshold]]
        else:
            self.features_importance[depth].append([best_feature,best_threshold])
            
        
        
    def __best_split(self,X,y):
        stop_split = False
        best_ig = 0
        features_name = X.columns
        best_feature = 'Not found'
        best_threshold = 'Not found'
        for feature_name in features_name:
            for threshold in np.unique(X[feature_name]):
                    
                ig = self.__information_gain(X[feature_name],threshold,y)
                
                if ig > best_ig:
                    best_ig = ig
                    best_threshold = threshold
                    best_feature = feature_name
                    print(f'best_feature best_threshold {best_feature} {best_threshold}') if self.verbose == 2 else ''
        
        if best_ig == 0:
            stop_split = True
            print('early stop : best_ig = 0')
            return None,None,stop_split
        print(f'best_feature,best_threshold {best_feature,best_threshold}\n')if self.verbose else ''
        
        return best_feature,best_threshold,stop_split
        
    @staticmethod
    def __most_common_label(y):
        counter = Counter(y.to_numpy().flatten())
        most_common = counter.most_common(1)[0][0]
        # print(most_common)
        return most_common
    
    
    def __grow_tree(self,X,y,current_depth=0):
        
        if (current_depth>self.max_depth) or (len(X.columns) == 0) or (len(np.unique(y)) == 1):
            if current_depth>self.max_depth:print("max depth reach")
            return Node(value=self.__most_common_label(y),n_value=len(y))
        
        print(f'current depth = {current_depth}') if self.verbose else ''
        
        best_feature,best_threshold,stop_grow =self.__best_split(X,y)
        
        if stop_grow:return Node(value=self.__most_common_label(y),n_value=len(y))
        
        left_idxs,right_idxs = self.__split_data(X[best_feature],best_threshold)
        
        if len(left_idxs) < self.min_samples_split or len(right_idxs) < self.min_samples_split:
            print("min samples split reach")
            return Node(value=self.__most_common_label(y),n_value=len(y))
       
        self.__update_features_importance(best_feature,best_threshold,current_depth) 
        
        left = self.__grow_tree(X.drop(best_feature,axis=1).loc[left_idxs],y.loc[left_idxs],current_depth+1)

        right = self.__grow_tree(X.drop(best_feature,axis=1).loc[right_idxs],y.loc[right_idxs],current_depth+1)
        
        
        return Node(best_feature,best_threshold,left,right)
    
    def fit(self,X,y,verbose=True):
        self.verbose = verbose
        self.n_features = X.shape[1]
        self.features_name = X.columns
        self.root = self.__grow_tree(X,y)
        
        predictions_train = self.predict(X)
        print('train accuracy : ',np.sum(predictions_train == y.to_numpy().flatten())/len(y)*100,'%')
        
    def __traverse_tree(self,X,node:Node):
        if node.is_leaf_node():
            return node.value
        
        if X[node.feature] <= node.threshold:
            return self.__traverse_tree(X,node.left)
        
        return self.__traverse_tree(X,node.right)
              
    def predict(self,X):
        return np.array([self.__traverse_tree(x[1],self.root) for x in X.iterrows()])
        
    
        

In [8]:
def print_tree(node:Node):
    print(node)
    if node.is_leaf_node():return
    print_tree(node.left)
    print_tree(node.right)
    

In [9]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
import pandas as pd

data = datasets.load_breast_cancer()


x = pd.DataFrame(data.data,columns=data.feature_names)
y = pd.DataFrame(data.target,columns=['target'])

X_train,X_test,y_train,y_test = train_test_split(x,y,test_size=0.2 ,random_state=42)

clf = DecisionTree(min_samples_split=5,max_depth=2)
clf.fit(X_train,y_train)


current depth = 0
best_feature,best_threshold ('mean concave points', 0.05074)

current depth = 1
best_feature,best_threshold ('worst radius', 16.77)

current depth = 2
best_feature,best_threshold ('radius error', 0.6061)

min samples split reach
current depth = 2
best_feature,best_threshold ('mean texture', 15.7)

max depth reach
max depth reach
current depth = 1
best_feature,best_threshold ('worst perimeter', 114.3)

current depth = 2
best_feature,best_threshold ('worst texture', 25.47)

max depth reach
max depth reach
train accuracy :  97.8021978021978 %


In [10]:
predictions_train = clf.predict(X_train)
predictions_test = clf.predict(X_test)

print('train accuracy : ',np.sum(predictions_train == y_train.to_numpy().flatten())/len(y_train)*100,'%')
print('test accuracy : ',np.sum(predictions_test == y_test.to_numpy().flatten())/len(y_test)*100,'%')

train accuracy :  97.8021978021978 %
test accuracy :  95.6140350877193 %


In [11]:
clf.features_importance

{0: [['mean concave points', 0.05074]],
 1: [['worst radius', 16.77], ['worst perimeter', 114.3]],
 2: [['mean texture', 15.7], ['worst texture', 25.47]]}

In [12]:
print_tree(clf.root)

mean concave points > 0.05074
worst radius > 16.77
value = 1
mean texture > 15.7
value = 1
value = 0
worst perimeter > 114.3
worst texture > 25.47
value = 1
value = 0
value = 0


In [16]:
df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data",header=None)
df.columns = ['age','sex','cp','restbp','chol','fbs','restecg','thalech','exang','oldpeak','slope','ca','thal','hd']
df_nomissing = df.loc[(df['ca']!='?')&(df['thal'] != '?')]
# X = df_nomissing.drop('hd',axis=1)
df_nomissing

Unnamed: 0,age,sex,cp,restbp,chol,fbs,restecg,thalech,exang,oldpeak,slope,ca,thal,hd
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,57.0,0.0,4.0,140.0,241.0,0.0,0.0,123.0,1.0,0.2,2.0,0.0,7.0,1
298,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1
299,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,2
300,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,3


In [17]:
X_encoded = pd.get_dummies(df_nomissing,columns=['cp','restecg','slope','thal','ca'])
X_encoded

Unnamed: 0,age,sex,restbp,chol,fbs,thalech,exang,oldpeak,hd,cp_1.0,...,slope_1.0,slope_2.0,slope_3.0,thal_3.0,thal_6.0,thal_7.0,ca_0.0,ca_1.0,ca_2.0,ca_3.0
0,63.0,1.0,145.0,233.0,1.0,150.0,0.0,2.3,0,1,...,0,0,1,0,1,0,1,0,0,0
1,67.0,1.0,160.0,286.0,0.0,108.0,1.0,1.5,2,0,...,0,1,0,1,0,0,0,0,0,1
2,67.0,1.0,120.0,229.0,0.0,129.0,1.0,2.6,1,0,...,0,1,0,0,0,1,0,0,1,0
3,37.0,1.0,130.0,250.0,0.0,187.0,0.0,3.5,0,0,...,0,0,1,1,0,0,1,0,0,0
4,41.0,0.0,130.0,204.0,0.0,172.0,0.0,1.4,0,0,...,1,0,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,57.0,0.0,140.0,241.0,0.0,123.0,1.0,0.2,1,0,...,0,1,0,0,0,1,1,0,0,0
298,45.0,1.0,110.0,264.0,0.0,132.0,0.0,1.2,1,1,...,0,1,0,0,0,1,1,0,0,0
299,68.0,1.0,144.0,193.0,1.0,141.0,0.0,3.4,2,0,...,0,1,0,0,0,1,0,0,1,0
300,57.0,1.0,130.0,131.0,0.0,115.0,1.0,1.2,3,0,...,0,1,0,0,0,1,0,1,0,0


In [18]:
df_majority = df[df.hd==0]
df_minority1 = df[df.hd==1]
df_minority2 = df[df.hd==2]
df_minority3 = df[df.hd==3]
df_minority4 = df[df.hd==4]

df_minority_oversampled1 = df_minority1.sample(replace=True, n=len(df_majority), random_state=42)
df_minority_oversampled2 = df_minority2.sample(replace=True, n=len(df_majority), random_state=42)
df_minority_oversampled3 = df_minority3.sample(replace=True, n=len(df_majority), random_state=42)
df_minority_oversampled4 = df_minority4.sample(replace=True, n=len(df_majority), random_state=42)

df_oversampled = pd.concat([df,df_minority_oversampled1,df_minority_oversampled2,df_minority_oversampled3,df_minority_oversampled4])
df_oversampled = df_oversampled.reset_index(drop=True)
df_oversampled

Unnamed: 0,age,sex,cp,restbp,chol,fbs,restecg,thalech,exang,oldpeak,slope,ca,thal,hd
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
954,64.0,1.0,4.0,145.0,212.0,0.0,2.0,132.0,0.0,2.0,2.0,2.0,6.0,4
955,61.0,1.0,4.0,138.0,166.0,0.0,2.0,125.0,1.0,3.6,2.0,1.0,3.0,4
956,58.0,1.0,4.0,114.0,318.0,0.0,1.0,140.0,0.0,4.4,3.0,3.0,6.0,4
957,58.0,1.0,3.0,112.0,230.0,0.0,2.0,165.0,0.0,2.5,2.0,1.0,7.0,4


In [19]:
y = X_encoded['hd'] 
y

0      0
1      2
2      1
3      0
4      0
      ..
297    1
298    1
299    2
300    3
301    1
Name: hd, Length: 297, dtype: int64

In [20]:
y.unique()

array([0, 2, 1, 3, 4], dtype=int64)

In [21]:
Counter(y)

Counter({0: 160, 2: 35, 1: 54, 3: 35, 4: 13})

In [22]:
X_train,X_test,y_train,y_test = train_test_split(X_encoded.drop('hd',axis=1),y,test_size=0.2 ,random_state=42)

clf = DecisionTree(min_samples_split=3,max_depth=15)
clf.fit(X_train,y_train,verbose=False)

min samples split reach
min samples split reach
min samples split reach
min samples split reach
min samples split reach
min samples split reach
min samples split reach
min samples split reach
min samples split reach
min samples split reach
min samples split reach
min samples split reach
min samples split reach
min samples split reach
min samples split reach
min samples split reach
min samples split reach
min samples split reach
train accuracy :  73.83966244725738 %


In [23]:
clf.features_importance

{0: [['ca_0.0', 0]],
 1: [['cp_4.0', 0], ['thal_7.0', 0]],
 2: [['oldpeak', 1.8],
  ['restecg_0.0', 0],
  ['thalech', 147.0],
  ['oldpeak', 1.9]],
 3: [['ca_3.0', 0],
  ['age', 63.0],
  ['age', 57.0],
  ['restecg_0.0', 0],
  ['thalech', 120.0]],
 4: [['thalech', 118.0],
  ['exang', 0.0],
  ['chol', 274.0],
  ['restbp', 108.0],
  ['fbs', 0.0],
  ['chol', 274.0],
  ['thalech', 132.0]],
 5: [['chol', 246.0],
  ['restbp', 125.0],
  ['oldpeak', 1.2],
  ['age', 50.0],
  ['chol', 211.0]],
 6: [['chol', 212.0], ['chol', 325.0], ['restbp', 142.0]]}

In [24]:
predictions_train = clf.predict(X_train)
predictions_test = clf.predict(X_test)

print('train accuracy : ',np.sum(predictions_train == y_train.to_numpy().flatten())/len(y_train)*100,'%')
print('test accuracy : ',np.sum(predictions_test == y_test.to_numpy().flatten())/len(y_test)*100,'%')

train accuracy :  73.83966244725738 %
test accuracy :  60.0 %


In [25]:
predictions_train

array([0, 0, 0, 1, 0, 0, 1, 0, 3, 1, 2, 2, 2, 0, 1, 1, 0, 0, 0, 0, 0, 3,
       0, 1, 2, 0, 0, 0, 0, 2, 4, 1, 0, 0, 0, 0, 3, 0, 0, 2, 0, 0, 2, 0,
       0, 0, 2, 3, 2, 0, 1, 0, 0, 0, 2, 0, 3, 1, 2, 0, 0, 4, 0, 3, 4, 0,
       3, 0, 0, 2, 0, 3, 0, 3, 2, 2, 1, 0, 1, 2, 4, 1, 2, 0, 1, 3, 3, 0,
       2, 0, 0, 2, 0, 3, 2, 0, 0, 0, 0, 0, 0, 1, 3, 0, 0, 0, 1, 3, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 3, 2, 0, 0, 2, 0, 4, 2, 0, 0,
       1, 4, 0, 0, 0, 3, 3, 0, 1, 0, 0, 0, 0, 0, 4, 0, 0, 0, 2, 2, 0, 0,
       3, 0, 0, 0, 0, 4, 3, 3, 0, 0, 2, 0, 0, 3, 0, 3, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 3, 0, 0, 2, 2, 0, 2, 3, 2, 3, 0, 0, 0, 0, 0, 0, 3,
       3, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [26]:
predictions_test

array([0, 2, 0, 4, 0, 0, 0, 0, 2, 0, 3, 0, 0, 2, 0, 0, 0, 0, 3, 0, 2, 3,
       0, 0, 4, 0, 3, 3, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 4, 1, 0, 2, 1, 0,
       0, 2, 0, 0, 3, 0, 0, 0, 0, 0, 1, 1, 0, 0, 2, 2], dtype=int64)

In [27]:
Counter(predictions_test)

Counter({0: 38, 2: 8, 4: 4, 3: 6, 1: 4})

In [28]:
data = datasets.load_iris()


x = pd.DataFrame(data.data,columns=data.feature_names)
y = pd.DataFrame(data.target,columns=['target'])

X_train,X_test,y_train,y_test = train_test_split(x,y,test_size=0.2 ,random_state=10)

clf = DecisionTree(min_samples_split=5,max_depth=5)
clf.fit(X_train,y_train)

current depth = 0
best_feature,best_threshold ('petal length (cm)', 1.9)

current depth = 1
best_feature,best_threshold ('petal width (cm)', 1.7)

current depth = 2
best_feature,best_threshold ('sepal length (cm)', 7.0)

min samples split reach
current depth = 2
best_feature,best_threshold ('sepal length (cm)', 5.9)

current depth = 3
best_feature,best_threshold ('sepal width (cm)', 3.0)

min samples split reach
train accuracy :  95.0 %


In [29]:
predictions_train = clf.predict(X_train)
predictions_test = clf.predict(X_test)

print('train accuracy : ',np.sum(predictions_train == y_train.to_numpy().flatten())/len(y_train)*100,'%')
print('test accuracy : ',np.sum(predictions_test == y_test.to_numpy().flatten())/len(y_test)*100,'%')

train accuracy :  95.0 %
test accuracy :  100.0 %


In [30]:
predictions_train

array([1, 1, 1, 2, 2, 0, 2, 2, 2, 2, 0, 0, 1, 0, 1, 0, 1, 2, 2, 2, 1, 2,
       1, 1, 1, 0, 0, 1, 0, 2, 0, 0, 1, 1, 2, 0, 2, 0, 1, 1, 0, 2, 2, 2,
       2, 1, 0, 1, 2, 1, 0, 2, 1, 1, 0, 0, 0, 1, 2, 2, 1, 0, 0, 0, 2, 2,
       1, 1, 2, 2, 2, 1, 1, 0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 2, 0, 2, 0, 1, 1, 2, 2, 0, 1, 2, 2, 1, 1, 2, 0, 2, 0,
       0, 1, 0, 2, 2, 2, 1, 0, 2, 0])

In [31]:
predictions_test

array([1, 2, 0, 1, 0, 1, 1, 1, 0, 1, 1, 2, 1, 0, 0, 2, 1, 0, 0, 0, 2, 2,
       2, 0, 1, 0, 1, 1, 1, 2])

In [32]:
clf.features_importance

{0: [['petal length (cm)', 1.9]],
 1: [['petal width (cm)', 1.7]],
 2: [['sepal length (cm)', 5.9]]}

In [33]:
data = datasets.load_sample_images()
img_0  =  np.array(data['images'][0])
img_1  =  np.array(data['images'][1])
img_0.shape

(427, 640, 3)

In [34]:
img = []
label = []
for i in img_0:
    img.append(i.flatten())
    label.append(0)
for i in img_1:
    img.append(i.flatten())
    label.append(1)


In [35]:
np.array(img).shape

(854, 1920)

In [36]:
label = pd.Series(label)
label

0      0
1      0
2      0
3      0
4      0
      ..
849    1
850    1
851    1
852    1
853    1
Length: 854, dtype: int64

In [37]:
df = pd.DataFrame(img)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1910,1911,1912,1913,1914,1915,1916,1917,1918,1919
0,-82,-55,-25,-82,-55,-25,-82,-55,-25,-82,...,-2,-6,-5,-1,-6,-5,-1,-6,-5,-1
1,-84,-57,-27,-83,-56,-26,-83,-56,-26,-82,...,-1,-5,-4,-1,-5,-4,-1,-5,-4,-1
2,-82,-55,-25,-82,-55,-25,-82,-55,-25,-82,...,-1,-4,-3,-1,-4,-3,-1,-4,-3,-1
3,-81,-54,-24,-81,-54,-24,-81,-54,-24,-81,...,-1,-4,-3,-1,-4,-3,-1,-4,-3,-1
4,-82,-55,-26,-82,-55,-26,-81,-54,-25,-81,...,-3,-5,-3,-4,-5,-3,-4,-5,-3,-4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
849,2,48,45,2,49,43,2,49,39,2,...,25,6,43,25,8,42,25,8,42,25
850,0,46,43,0,47,41,0,47,37,1,...,24,6,43,25,8,42,26,7,41,25
851,0,46,40,1,48,40,1,47,37,1,...,25,5,44,26,6,43,26,7,44,27
852,0,47,41,1,48,40,1,47,37,2,...,26,6,45,27,7,44,27,7,44,27


In [38]:

X_train,X_test,y_train,y_test = train_test_split(df,label,test_size=0.2 ,random_state=10)

clf = DecisionTree(min_samples_split=5,max_depth=5)
clf.fit(X_train,y_train)

current depth = 0
best_feature,best_threshold (1474, 30)

current depth = 1


In [None]:
predictions_train = clf.predict(X_train)
predictions_test = clf.predict(X_test)

print('train accuracy : ',np.sum(predictions_train == y_train.to_numpy().flatten())/len(y_train)*100,'%')
print('test accuracy : ',np.sum(predictions_test == y_test.to_numpy().flatten())/len(y_test)*100,'%')

train accuracy :  99.85358711566617 %
test accuracy :  98.83040935672514 %


In [None]:
clf.features_importance

{0: [[1474, 30]], 1: [[387, 8]]}

In [None]:
data = datasets.load_iris()


x = pd.DataFrame(data.data,columns=data.feature_names)
y = pd.DataFrame(data.target,columns=['target'])

X_train,X_test,y_train,y_test = train_test_split(x,y,test_size=0.2 ,random_state=42)

clf = DecisionTree(min_samples_split=5,max_depth=5)
clf.fit(X_train,y_train)

current depth = 0
best_feature,best_threshold ('petal length (cm)', 1.9)

current depth = 1
best_feature,best_threshold ('petal width (cm)', 1.7)

current depth = 2
best_feature,best_threshold ('sepal length (cm)', 7.0)

min samples split reach
current depth = 2
best_feature,best_threshold ('sepal length (cm)', 5.9)

current depth = 3
best_feature,best_threshold ('sepal width (cm)', 3.0)

min samples split reach
train accuracy :  95.0 %


In [None]:
predictions_train = clf.predict(X_train)
predictions_test = clf.predict(X_test)

print('train accuracy : ',np.sum(predictions_train == y_train.to_numpy().flatten())/len(y_train)*100,'%')
print('test accuracy : ',np.sum(predictions_test == y_test.to_numpy().flatten())/len(y_test)*100,'%')

train accuracy :  95.0 %
test accuracy :  100.0 %
