In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [10]:
# Read the capstone CSV, discard the identifier/demographic columns, then cut
# off the trailing column so that the last remaining column is the class label.
df = (
    pd.read_csv("/Users/mdrizwanulaminadmin/RINTS/UTS/Machine learning /capstone2.csv")
    .drop(columns=['ID', 'No_Pation', 'Gender'])
    .iloc[:, :-1]
)
df

Unnamed: 0,AGE,Urea,Cr (Creatinine Ratio),HbA1c (Sugar level),Cholesterol,TG (Triglycerides),HDL,LDL,VLDL,BMI,CLASS
0,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
1,26,4.5,62,4.9,3.7,1.4,1.1,2.1,0.6,23.0,N
2,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
3,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
4,33,7.1,46,4.9,4.9,1.0,0.8,2.0,0.4,21.0,N
...,...,...,...,...,...,...,...,...,...,...,...
995,71,11.0,97,7.0,7.5,1.7,1.2,1.8,0.6,30.0,Y
996,31,3.0,60,12.3,4.1,2.2,0.7,2.4,15.4,37.2,Y
997,30,7.1,81,6.7,4.1,1.1,1.2,2.4,8.1,27.4,Y
998,38,5.8,59,6.7,5.3,2.0,1.6,2.9,14.0,40.5,Y


In [11]:
# Every column except the last is a feature; the last column is the label,
# kept as an (n_samples, 1) column so it can be concatenated with X later.
X, Y = df.iloc[:, :-1].to_numpy(), df.iloc[:, [-1]].to_numpy()

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=40,
)

In [12]:
class Node:
    """One node of the decision tree.

    A decision node carries ``feature_index``, ``threshold``, the two children
    (``left_Node`` / ``right_Node``) and the ``info_gain`` of its split.
    A leaf node carries only ``value``; all other fields stay ``None``.
    """

    def __init__(self, feature_index=None, threshold=None, left_Node=None, right_Node=None, info_gain=None, value=None):
        # Leaf payload (None on decision nodes).
        self.value = value

        # Split description (None on leaf nodes).
        self.feature_index, self.threshold = feature_index, threshold
        self.info_gain = info_gain

        # Child links (None on leaf nodes).
        self.left_Node, self.right_Node = left_Node, right_Node

In [13]:
class ID3_Classifier():
    """Binary decision-tree classifier whose splits maximise entropy-based information gain.

    Each decision node tests ``x[feature_index] <= threshold``; samples that
    satisfy the test go to the left child, all others to the right child.

    Parameters
    ----------
    min_samples_split : int or None
        Minimum number of samples a node must hold before a split is attempted.
        ``None`` is treated as 2.
    max_depth : int or None
        Deepest level (the root is level 0) at which a node may still be split,
        so leaves can sit one level below it. ``None`` means no limit.
    """

    def __init__(self, min_samples_split = None, max_depth = None):

        self.root = None                               # set by fit()

        # Stored exactly as given; None is resolved to a usable limit in build_tree.
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth

    def build_tree(self, dataset, current_depth=0):
        """Recursively build the (sub)tree for ``dataset`` and return its root Node.

        ``dataset`` is a 2-D array whose last column holds the labels and whose
        remaining columns hold the features.
        """
        X, Y = dataset[:, :-1], dataset[:, -1]
        num_samples, num_features = np.shape(X)

        # Comparing against None raises TypeError, so map None to "no restriction".
        min_samples = 2 if self.min_samples_split is None else self.min_samples_split
        depth_limit = float("inf") if self.max_depth is None else self.max_depth

        # NOTE: "<=" lets a node at depth == max_depth split once more.
        if num_samples >= min_samples and current_depth <= depth_limit:

            best_split = self.find_best_split(dataset, num_samples, num_features)

            # find_best_split returns {} when no threshold separates the samples
            # (e.g. all feature values identical); treat that as "no gain".
            if best_split.get("info_gain", 0) > 0:

                left_subtree = self.build_tree(best_split["left_dataset"], current_depth + 1)
                right_subtree = self.build_tree(best_split["right_dataset"], current_depth + 1)

                return Node(best_split["feature_index"], best_split["threshold"],
                            left_subtree, right_subtree, best_split["info_gain"])

        # Stopping condition reached or no useful split: emit a leaf node.
        return Node(value=self.calculate_leaf_Value(Y))

    def find_best_split(self, dataset, num_samples, num_features):
        """Search every feature/threshold pair for the highest information gain.

        Returns a dict with keys ``feature_index``, ``threshold``,
        ``left_dataset``, ``right_dataset`` and ``info_gain``; the dict is empty
        when no threshold yields two non-empty partitions.
        """
        best_split = {}
        best_info_gain = -1
        y = dataset[:, -1]                             # parent labels, loop-invariant

        for feature_index in range(num_features):

            feature_Values = dataset[:, feature_index]

            for threshold in np.unique(feature_Values):

                # Boolean masks instead of rebuilding the partitions row by row.
                # The right mask is computed explicitly (not as ~left) so rows
                # that compare False both ways (NaN) are left out, as before.
                left_mask = np.asarray(feature_Values <= threshold, dtype=bool)
                right_mask = np.asarray(feature_Values > threshold, dtype=bool)

                left_dataset = dataset[left_mask]
                right_dataset = dataset[right_mask]

                if len(left_dataset) > 0 and len(right_dataset) > 0:

                    info_gain = self.information_Gain(y, left_dataset[:, -1], right_dataset[:, -1], mode="entropy")

                    if info_gain > best_info_gain:     # strictly better: keep the first best
                        best_split["feature_index"] = feature_index
                        best_split["threshold"] = threshold
                        best_split["left_dataset"] = left_dataset
                        best_split["right_dataset"] = right_dataset
                        best_split["info_gain"] = info_gain
                        best_info_gain = info_gain

        return best_split

    def information_Gain(self, parent, left_child, right_child, mode = "entropy"):
        """Return entropy(parent) minus the size-weighted entropy of the two children.

        ``mode`` is accepted for interface compatibility but is currently
        ignored: entropy is always used as the impurity measure.
        """
        weight_left = len(left_child) / len(parent)
        weight_right = len(right_child) / len(parent)

        return self.entropy(parent) - (weight_left*self.entropy(left_child) + weight_right*self.entropy(right_child))

    def entropy(self, y):
        """Return the Shannon entropy (base 2) of the label array ``y``."""
        # One pass to count every class instead of re-scanning y per class.
        _, class_counts = np.unique(y, return_counts=True)
        total = len(y)

        entropy = 0
        for count in class_counts:
            p_Class = count / total
            entropy += -(p_Class * np.log2(p_Class))

        return entropy

    def calculate_leaf_Value(self, Y):
        """Return the most frequent label in ``Y``.

        Ties are resolved in favour of the label that appears first in ``Y``.
        """
        # Single counting pass; dict insertion order keeps first-seen order,
        # so max() picks the earliest label among equally frequent ones.
        counts = {}
        for label in Y:
            counts[label] = counts.get(label, 0) + 1

        return max(counts, key=counts.get)

    def print_tree(self, tree = None, indent=" "):
        """Print the tree rooted at ``tree`` (defaults to the fitted root)."""
        if tree is None:
            tree = self.root

        if tree.value is not None:
            print(tree.value)

        else:
            print("feature_index # "+str(tree.feature_index), "<=", tree.threshold, "?", tree.info_gain)
            print("%sleft:" % (indent), end="")
            self.print_tree(tree.left_Node, indent + indent)
            print("%sright:" % (indent), end="")
            self.print_tree(tree.right_Node, indent + indent)

    def fit(self, X, Y):
        """Build the tree from features ``X`` (n_samples, n_features) and labels ``Y``.

        ``Y`` may be a column of shape (n_samples, 1) or a flat 1-D sequence.
        """
        Y = np.asarray(Y)
        if Y.ndim == 1:
            # concatenate along axis=1 needs a 2-D column of labels.
            Y = Y.reshape(-1, 1)

        dataset = np.concatenate((X, Y), axis=1)
        self.root = self.build_tree(dataset, current_depth = 0)

    def predict(self, X):
        """Return a list with one predicted label per row of ``X``."""
        return [self.make_prediction(x, self.root) for x in X]

    def make_prediction(self, x, tree):
        """Walk from ``tree`` down to a leaf for the single sample ``x`` and return its label."""
        # Identity test: "!= None" would go through the label's own __ne__.
        if tree.value is not None:
            return tree.value

        if x[tree.feature_index] <= tree.threshold:
            return self.make_prediction(x, tree.left_Node)
        return self.make_prediction(x, tree.right_Node)


In [14]:
# Fit the tree on the training split (nodes with fewer than 3 samples are not
# split; depth limit 3) and print the learned structure.
classifier = ID3_Classifier(max_depth=3, min_samples_split=3)
classifier.fit(X_train, Y_train)
classifier.print_tree()

feature_index # 9 <= 25.0 ? 0.3989226904091368
 left:feature_index # 3 <= 5.6 ? 0.6590772854562998
  left:feature_index # 4 <= 4.9 ? 0.26158106096604483
    left:feature_index # 9 <= 24.6 ? 0.3567800143839433
        left:N
        right:Y
    right:feature_index # 5 <= 1.9 ? 0.44477166784364586
        left:N
        right:Y
  right:feature_index # 3 <= 6.4 ? 0.9514122535330207
    left:P
    right:Y
 right:feature_index # 0 <= 48.0 ? 0.033189423654355316
  left:feature_index # 3 <= 6.3 ? 0.5713549744279549
    left:P
    right:Y
  right:Y


In [15]:
# Score the fitted tree on the held-out rows and report accuracy as a percentage.
Y_pred = classifier.predict(X_test)
accuracy = 100 * accuracy_score(Y_test, Y_pred)

print(f"Accuracy: {accuracy}%")

Accuracy: 98.5%
