In [245]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.tree import export_graphviz
import pydotplus
from IPython.display import Image


In [246]:
# Location of the diabetes dataset on the author's machine.
csv_path = "/Users/mdrizwanulaminadmin/RINTS/UTS/Machine learning /800diabetes.csv"

# Earlier experiments (now disabled) read capstone2.csv from the same folder,
# dropped the 'ID', 'No_Pation' and 'Gender' columns, and trimmed the last column.
df = pd.read_csv(csv_path)

# Bare expression so the notebook renders the frame as the cell output.
df

Unnamed: 0,AGE,Urea,HbA1c (Sugar level),Cholesterol,HDL,BMI,CLASS
0,50,4.7,4.9,4.2,2.4,24.0,N
1,26,4.5,4.9,3.7,1.1,23.0,N
2,50,4.7,4.9,4.2,2.4,24.0,N
3,50,4.7,4.9,4.2,2.4,24.0,N
4,33,7.1,4.9,4.9,0.8,21.0,N
...,...,...,...,...,...,...,...
795,60,4.9,10.2,3.9,1.3,29.0,Y
796,60,2.1,12.3,6.2,1.0,30.0,Y
797,61,7.1,13.6,6.6,1.1,31.0,Y
798,63,2.8,11.2,4.2,1.7,36.0,Y


In [247]:
# Every column except the last one is used as a predictor.
X = df.iloc[:, :-1].values

# The last column holds the label; it is reshaped into a single-column 2-D array.
label_column = df.iloc[:, -1]
Y = label_column.values.reshape(-1, 1)

# 80/20 train/test split with a fixed seed so the split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=40,
)


In [248]:
class Node():
    """One node of the decision tree.

    A node built with ``value`` set acts as a leaf and carries the predicted
    label. A node built with the split fields set acts as a decision node that
    routes samples to ``left_Node`` or ``right_Node``.
    """

    def __init__(self, feature_index=None, threshold=None, left_Node=None, right_Node=None, info_gain=None, value=None):
        # Split description and children (populated for decision nodes).
        (
            self.feature_index,
            self.threshold,
            self.left_Node,
            self.right_Node,
            self.info_gain,
        ) = (feature_index, threshold, left_Node, right_Node, info_gain)

        # Predicted label (populated for leaf nodes).
        self.value = value

In [249]:
class ID3_Classifier():
    """Decision-tree classifier whose splits are chosen by information gain.

    Every decision node tests ``feature <= threshold``: samples that satisfy the
    test go to the left child, samples strictly above the threshold go to the
    right child. Each leaf stores the most frequent training label that reached it.
    """

    # Column names print_Tree falls back to when the caller supplies none.
    DEFAULT_FEATURE_NAMES = ('AGE', 'Urea', 'HbA1c (Sugar level)', 'Cholesterol', 'HDL', 'BMI')

    def __init__(self, min_samples_split = None, max_depth = None):
        """Store the stopping criteria.

        Args:
            min_samples_split: minimum number of samples a node must hold to be
                considered for splitting. ``None`` is treated as 2.
            max_depth: a node at depth ``d`` (root is depth 0) may be split only
                while ``d <= max_depth``. ``None`` means no depth limit.
        """
        self.root = None
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth

    def build_tree(self, dataset, current_depth=0):
        """Recursively grow the tree for ``dataset`` and return its root ``Node``.

        Args:
            dataset: 2-D array; the last column holds the labels, the other
                columns hold the features.
            current_depth: depth of the node being built (root is 0).

        Returns:
            A decision ``Node`` when a split with positive information gain is
            found and the stopping criteria allow it, otherwise a leaf ``Node``.
        """
        X, Y = dataset[:,:-1], dataset[:,-1]
        num_samples, num_features = np.shape(X)

        # Resolve the ``None`` defaults here so comparing against them cannot raise.
        min_samples = 2 if self.min_samples_split is None else self.min_samples_split
        depth_allowed = self.max_depth is None or current_depth <= self.max_depth

        if num_samples >= min_samples and depth_allowed:
            best_split = self.find_best_split(dataset, num_samples, num_features)

            # An empty dict means no threshold produced two non-empty sides.
            if best_split.get("info_gain", 0) > 0:
                left_subtree = self.build_tree(best_split["left_dataset"], current_depth + 1)
                right_subtree = self.build_tree(best_split["right_dataset"], current_depth + 1)
                return Node(best_split["feature_index"], best_split["threshold"],
                            left_subtree, right_subtree, best_split["info_gain"])

        # Stopping criterion hit, or nothing useful to split on: emit a leaf.
        return Node(value=self.calculate_leaf_Value(Y))

    def find_best_split(self, dataset, num_samples, num_features):
        """Search every feature and every observed value for the best split.

        Args:
            dataset: 2-D array whose last column holds the labels.
            num_samples: number of rows in ``dataset`` (kept for interface
                compatibility; not needed by the search).
            num_features: number of feature columns to scan.

        Returns:
            dict with keys ``feature_index``, ``threshold``, ``left_dataset``,
            ``right_dataset`` and ``info_gain`` for the best split, or an empty
            dict when no threshold separates the rows into two non-empty sides.
        """
        best_split = {}
        best_info_gain = -1
        labels = dataset[:, -1]

        for feature_index in range(num_features):
            feature_Values = dataset[:, feature_index]

            for threshold in np.unique(feature_Values):
                # Boolean masks replace the per-row Python loops. The right side is
                # computed with ``>`` (not by negating the left mask) so that values
                # that compare false both ways are excluded from both sides.
                goes_left = np.asarray(feature_Values <= threshold, dtype=bool)
                goes_right = np.asarray(feature_Values > threshold, dtype=bool)

                if not (goes_left.any() and goes_right.any()):
                    continue

                info_gain = self.information_Gain(labels, labels[goes_left], labels[goes_right], mode="entropy")

                if info_gain > best_info_gain:
                    best_split["feature_index"] = feature_index
                    best_split["threshold"] = threshold
                    best_split["left_dataset"] = dataset[goes_left]
                    best_split["right_dataset"] = dataset[goes_right]
                    best_split["info_gain"] = info_gain
                    best_info_gain = info_gain

        return best_split

    def information_Gain(self, parent, left_child, right_child, mode = "entropy"):
        """Impurity of ``parent`` minus the size-weighted impurity of its children.

        Args:
            parent, left_child, right_child: 1-D label arrays.
            mode: ``"gini"`` measures impurity with the Gini index; any other
                value (including the default ``"entropy"``) uses entropy.
        """
        impurity = self.gini_index if mode == "gini" else self.entropy

        weight_left = len(left_child) / len(parent)
        weight_right = len(right_child) / len(parent)

        return impurity(parent) - (weight_left*impurity(left_child) + weight_right*impurity(right_child))

    def entropy(self, y):
        """Entropy, in bits, of the label array ``y``."""
        _, class_counts = np.unique(y, return_counts=True)
        probabilities = class_counts / len(y)
        return -np.sum(probabilities * np.log2(probabilities))

    def gini_index(self, y):
        """Gini impurity of the label array ``y``."""
        _, class_counts = np.unique(y, return_counts=True)
        probabilities = class_counts / len(y)
        return 1 - np.sum(probabilities ** 2)

    def calculate_leaf_Value(self, Y):
        """Return the most frequent label in ``Y``.

        Ties go to the label that appears first in ``Y``.
        """
        # Single counting pass; dicts keep first-seen order, and ``max`` returns
        # the first maximal key, which gives the tie-breaking described above.
        counts = {}
        for label in list(Y):
            counts[label] = counts.get(label, 0) + 1

        return max(counts, key=counts.get)

    def print_Tree(self, tree=None, indent=" ", feature_names=None):
        """Print the tree rooted at ``tree`` (defaults to the fitted root).

        Args:
            tree: node to start from; ``None`` means ``self.root``.
            indent: prefix used for the ``Left:`` / ``Right:`` lines of this level.
            feature_names: optional sequence mapping feature index to a display
                name; defaults to ``DEFAULT_FEATURE_NAMES``. Indices outside the
                sequence are shown as ``Feature <index>``.
        """
        if feature_names is None:
            feature_names = self.DEFAULT_FEATURE_NAMES

        if tree is None:
            tree = self.root

        if tree.value is not None:
            if tree.value in ["N", "Y", "P"]:            # list of target classes
                print(f"Leaf Node: {tree.value}")
            else:
                print(f"Leaf Node: Class {tree.value}")
            return

        if tree.feature_index is None:
            feature_name = "Unknown"
        elif 0 <= tree.feature_index < len(feature_names):
            feature_name = feature_names[tree.feature_index]
        else:
            feature_name = f"Feature {tree.feature_index}"

        print(f"Decision Node: {feature_name} <= {tree.threshold} (Info Gain: {tree.info_gain})")
        print(f"{indent}Left:")
        self.print_Tree(tree.left_Node, indent + "  ", feature_names)
        print(f"{indent}Right:")
        self.print_Tree(tree.right_Node, indent + "  ", feature_names)

    def fit(self, X, Y):
        """Train the tree on features ``X`` and labels ``Y``.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            Y: labels, either 1-D of length n_samples or 2-D with one column.

        Returns:
            self, so calls can be chained.

        Raises:
            ValueError: if ``X`` contains no samples.
        """
        X = np.asarray(X)
        Y = np.asarray(Y)

        if Y.ndim == 1:
            Y = Y.reshape(-1, 1)

        # Fixed-width string labels would make np.concatenate promote the numeric
        # features to strings; object dtype keeps each column's values intact.
        if Y.dtype.kind in "US":
            Y = Y.astype(object)

        if X.shape[0] == 0:
            raise ValueError("Cannot fit a decision tree on an empty dataset")

        dataset = np.concatenate((X, Y), axis=1)
        self.root = self.build_tree(dataset, current_depth = 0)
        return self

    def predict(self, X):
        """Return a list with the predicted label for every row of ``X``."""
        return [self.make_prediction(x, self.root) for x in X]

    def make_prediction(self, x, tree):
        """Walk from ``tree`` down to a leaf for sample ``x`` and return its label."""
        node = tree
        while node.value is None:
            if x[node.feature_index] <= node.threshold:
                node = node.left_Node
            else:
                node = node.right_Node
        return node.value


In [250]:
# Build the tree: nodes with fewer than 3 samples are not split, and max_depth is 3.
classifier = ID3_Classifier(max_depth=3, min_samples_split=3)

# Train on the 80% training portion.
classifier.fit(X_train, Y_train)

# Show the learned decision rules.
classifier.print_Tree()

Decision Node: BMI <= 25.0 (Info Gain: 0.5047650762174729)
 Left:
Decision Node: HbA1c (Sugar level) <= 5.6 (Info Gain: 0.6904416791104199)
   Left:
Decision Node: Cholesterol <= 4.9 (Info Gain: 0.22005511305879277)
     Left:
Decision Node: BMI <= 24.6 (Info Gain: 0.3372900666170139)
       Left:
Leaf Node: N
       Right:
Leaf Node: Y
     Right:
Decision Node: Cholesterol <= 7.1 (Info Gain: 0.12148687778112766)
       Left:
Leaf Node: Y
       Right:
Leaf Node: N
   Right:
Decision Node: HbA1c (Sugar level) <= 6.4 (Info Gain: 0.6840384356390417)
     Left:
Leaf Node: P
     Right:
Leaf Node: Y
 Right:
Decision Node: AGE <= 48.0 (Info Gain: 0.038015507838487)
   Left:
Decision Node: HbA1c (Sugar level) <= 6.3 (Info Gain: 0.6840384356390417)
     Left:
Leaf Node: P
     Right:
Leaf Node: Y
   Right:
Leaf Node: Y


In [251]:
# Predict labels for the held-out rows.
Y_pred = classifier.predict(X_test)

# Fraction of correct predictions, expressed as a percentage.
accuracy = 100 * accuracy_score(y_true=Y_test, y_pred=Y_pred)
print(f"Accuracy: {accuracy}%")



Accuracy: 98.75%
