<a href="https://colab.research.google.com/github/Noor-Z1/Machine-Learning/blob/main/Decision_Tree_Scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

from math import log2 as log

import numpy as np

# In the decision tree, non-leaf nodes are going to be represented via TreeNode

class TreeNode:
    """Internal (non-leaf) node of the decision tree.

    ``attribute`` holds whatever the caller passes in to identify the
    attribute tested at this node; ``subtrees`` starts out empty and is
    filled by the tree builder with one child per branch.
    """

    def __init__(self, attribute):
        self.attribute = attribute
        # Branch table: each value is either another TreeNode or a TreeLeafNode.
        self.subtrees = dict()


# In the decision tree, leaf nodes are going to be represented via TreeLeafNode
class TreeLeafNode:
    """Leaf of the decision tree: stores the instances that reached it.

    Note the naming asymmetry: the constructor argument is ``label`` but it
    is stored on the instance as ``labels``.
    """

    def __init__(self, data, label):
        self.data, self.labels = data, label

class DecisionTree:
    """ID3 decision tree classifier over categorical attributes.

    The tree is built by ``train`` and queried by ``predict``. Internal nodes
    are ``TreeNode`` objects whose ``subtrees`` dictionary is keyed by the
    attribute value of each branch; leaves are ``TreeLeafNode`` objects.
    Labels may be any hashable values (not only 0/1).
    """

    def __init__(self, dataset: list, labels, features, criterion="information gain"):
        """
        :param dataset: array of data instances, each data instance is represented via an Python array
        :param labels: array of the labels of the data instances
        :param features: the array that stores the name of each feature dimension
        :param criterion: depending on which criterion ("information gain" or "gain ratio") the splits are to be performed
        """
        self.dataset = dataset
        self.labels = labels
        self.features = features
        self.criterion = criterion
        # it keeps the root node of the decision tree
        self.root = None

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _entropy_from_counts(counts):
        """Return the base-2 entropy of a distribution given as raw counts."""
        counts = list(counts)
        total = sum(counts)
        if total == 0:
            # No instances at all: define the entropy as 0 instead of dividing by zero.
            return 0.0
        # Zero counts are skipped (0 * log 0 is taken as 0). "0.0 - ..." avoids returning -0.0.
        return 0.0 - sum((c / total) * log(c / total) for c in counts if c)

    @staticmethod
    def _count_labels(labels):
        """Return a dict mapping each distinct label to its number of occurrences."""
        counts = {}
        for label in labels:
            counts[label] = counts.get(label, 0) + 1
        return counts

    @staticmethod
    def _group_by_value(dataset, labels, attribute):
        """Partition instances by their value of ``attribute``.

        Returns a dict ``value -> (rows, row_labels)`` in order of first appearance.
        """
        groups = {}
        for row, label in zip(dataset, labels):
            rows, row_labels = groups.setdefault(row[attribute], ([], []))
            rows.append(row)
            row_labels.append(label)
        return groups

    def _majority_label(self, labels):
        """Return the most frequent label (first seen wins ties), or None if empty."""
        counts = self._count_labels(labels)
        if not counts:
            return None
        return max(counts, key=counts.get)

    @staticmethod
    def _labels_below(node):
        """Collect the labels stored in every leaf reachable from ``node``."""
        collected = []
        pending = [node]
        while pending:
            current = pending.pop()
            if isinstance(current, TreeLeafNode):
                collected.extend(current.labels)
            else:
                pending.extend(current.subtrees.values())
        return collected

    def _split_score(self, dataset, labels, attribute):
        """Score ``attribute`` as a split candidate according to ``self.criterion``."""
        if self.criterion == "information gain":
            return self.calculate_information_gain__(dataset, labels, attribute)
        if self.criterion == "gain ratio":
            return self.calculate_gain_ratio__(dataset, labels, attribute)
        raise ValueError("unknown criterion: %r" % (self.criterion,))

    # ------------------------------------------------------------------
    # impurity measures
    # ------------------------------------------------------------------
    def calculate_entropy__(self, dataset, labels):
        """
        :param dataset: array of the data instances (not used; kept for a uniform signature)
        :param labels: array of the labels of the data instances
        :return: calculated entropy value for the given dataset (0.0 when labels is empty)
        """
        return self._entropy_from_counts(self._count_labels(labels).values())

    def unique_params(self, dataset, labels, attribute):
        """
        :param dataset: array of the data instances
        :param labels: array of the labels of those data instances (bounds how many instances are read)
        :param attribute: index of the attribute whose values are collected
        :return: sorted numpy array of the distinct values the attribute takes
        """
        return np.unique([row[attribute] for row, _ in zip(dataset, labels)])

    def calculate_average_entropy__(self, dataset, labels, attribute):
        """
        :param dataset: array of the data instances on which an average entropy value is calculated
        :param labels: array of the labels of those data instances
        :param attribute: for which attribute an average entropy value is going to be calculated...
        :return: the calculated average entropy value for the given attribute
        """
        groups = self._group_by_value(dataset, labels, attribute)
        total = sum(len(row_labels) for _, row_labels in groups.values())
        if total == 0:
            return 0.0

        average_entropy = 0.0
        for _, row_labels in groups.values():
            # Each partition contributes its entropy weighted by its share of the instances.
            weight = len(row_labels) / total
            average_entropy += weight * self.calculate_entropy__(None, row_labels)
        return average_entropy

    def calculate_information_gain__(self, dataset, labels, attribute):
        """
        :param dataset: array of the data instances on which an information gain score is going to be calculated
        :param labels: array of the labels of those data instances
        :param attribute: for which attribute the information gain score is going to be calculated...
        :return: the calculated information gain score
        """
        before = self.calculate_entropy__(dataset, labels)
        after = self.calculate_average_entropy__(dataset, labels, attribute)
        return before - after

    def calculate_intrinsic_information__(self, dataset, labels, attribute):
        """
        :param dataset: array of data instances on which an intrinsic information score is going to be calculated
        :param labels: array of the labels of those data instances
        :param attribute: for which attribute the intrinsic information score is going to be calculated...
        :return: the calculated intrinsic information score
        """
        # Intrinsic information is the entropy of the partition sizes themselves;
        # the labels only determine how many instances are considered.
        groups = self._group_by_value(dataset, labels, attribute)
        return self._entropy_from_counts(
            len(row_labels) for _, row_labels in groups.values()
        )

    def calculate_gain_ratio__(self, dataset, labels, attribute):
        """
        :param dataset: array of data instances on which a gain ratio is going to be calculated
        :param labels: array of the labels of those data instances
        :param attribute: for which attribute the gain ratio score is going to be calculated...
        :return: information gain divided by intrinsic information, or 0.0 when
                 the attribute does not partition the data at all
        """
        intrinsic_info = self.calculate_intrinsic_information__(dataset, labels, attribute)
        if intrinsic_info == 0:
            # A single-valued attribute carries no split information; avoid dividing by zero.
            return 0.0
        return self.calculate_information_gain__(dataset, labels, attribute) / intrinsic_info

    def split_dataset(self, dataset, labels, attribute, param):
        """
        :param dataset: array of the data instances
        :param labels: array of the labels of those data instances
        :param attribute: index of the attribute to test
        :param param: the attribute value to match
        :return: (matching instances, their labels, remaining instances, their labels)
        """
        sub_dataset_0 = []
        sub_labels_0 = []
        sub_dataset_1 = []
        sub_labels_1 = []

        for row, label in zip(dataset, labels):
            if row[attribute] == param:
                sub_dataset_0.append(row)
                sub_labels_0.append(label)
            else:
                sub_dataset_1.append(row)
                sub_labels_1.append(label)

        return sub_dataset_0, sub_labels_0, sub_dataset_1, sub_labels_1

    # ------------------------------------------------------------------
    # tree construction and inference
    # ------------------------------------------------------------------
    def ID3__(self, dataset, labels, used_attributes, features):
        """
        Recursive function for ID3 algorithm
        :param dataset: data instances falling under the current tree node
        :param labels: labels of those instances
        :param used_attributes: names of the attributes already tested on the path
                                from the root; they are not considered again
        :param features: names of the attributes, aligned with the columns of dataset
        :return: it returns a created non-leaf node or a created leaf node
        :raises ValueError: if self.criterion is neither "information gain" nor "gain ratio"
        """
        # Pure (or empty) node: nothing left to separate.
        if len(self._count_labels(labels)) <= 1:
            return TreeLeafNode(dataset, labels)

        # Skipping attributes already used on this path guarantees termination
        # even when identical instances carry different labels.
        candidates = [
            index for index, name in enumerate(features) if name not in used_attributes
        ]
        if not candidates:
            # Attributes exhausted but labels still mixed: predict() takes the majority.
            return TreeLeafNode(dataset, labels)

        # max() keeps the first of equally good attributes, like argmax would.
        best = max(
            candidates,
            key=lambda index: self._split_score(dataset, labels, index),
        )

        node = TreeNode(features[best])
        if not used_attributes:
            self.root = node

        # Pass a fresh list down so sibling branches do not see each other's choices.
        path_attributes = [*used_attributes, features[best]]
        groups = self._group_by_value(dataset, labels, best)
        for value, (rows, row_labels) in groups.items():
            # Branches are keyed by the attribute value itself so that predict()
            # can look them up directly, independent of which values occur where.
            node.subtrees[value] = self.ID3__(rows, row_labels, path_attributes, features)

        return node

    def predict(self, x):
        """
        :param x: a data instance, 1 dimensional Python array
        :return: predicted label of x

        If a leaf node contains multiple labels in it, the majority label should be returned as the predicted label.
        If x carries an attribute value that was not seen on the followed branch during
        training, the majority label of the training instances below that node is returned.
        :raises RuntimeError: if the tree has not been trained yet
        """
        if self.root is None:
            raise RuntimeError("train() must be called before predict()")

        feature_names = list(self.features)
        node = self.root
        while not isinstance(node, TreeLeafNode):
            index = feature_names.index(node.attribute)
            child = node.subtrees.get(x[index])
            if child is None:
                return self._majority_label(self._labels_below(node))
            node = child

        return self._majority_label(node.labels)

    def train(self):
        """Build the tree from the dataset given at construction and store it in self.root."""
        self.root = self.ID3__(self.dataset, self.labels, [], self.features)

        print("Training completed")

In [None]:
import numpy as np
import math
#from ID3 import DecisionTree


# Toy categorical dataset used to exercise the decision tree below.
# Column names; the i-th name labels the i-th entry of every row in `dataset`.
features = ["Temperature", "Outlook", "Humidity", "Windy"]
# Golf played?...
# One binary target per row of `dataset` (same order, 14 entries).
# Presumably 1 = golf was played and 0 = it was not -- confirm against the data source.
labels = [0,0,1,1,1,0,1,1,1,1,1,0,0,1]
# Each row holds one observation; all values are strings, including the
# "true"/"false" entries of the last column.
dataset = [
["hot" ,"sunny", "high", "false"],
["hot" ,"sunny", "high", "true"],
["hot" ,"overcast", "high", "false"],
["cool", "rain", "normal", "false"],
["cool", "overcast", "normal", "true"],
["mild", "sunny", "high", "false"],
["cool", "sunny", "normal", "false"],
["mild", "rain", "normal", "false"],
["mild", "sunny", "normal", "true"],
["mild", "overcast", "high", "true"],
["hot" ,"overcast", "normal", "false"],
["mild", "rain", "high", "true"],
["cool", "rain", "normal", "true"],
["mild", "rain", "high", "false"]]


"""
tree= DecisionTree(dataset, labels, features, "information gain" )

tree.train()


print(tree.root.attribute)

for i in range(len(tree.root.subtrees)):

  if type(tree.root.subtrees[i]) == TreeNode   :
   print(tree.root.subtrees[i].attribute)
   for k in range(len(tree.root.subtrees[i].subtrees)):
     if type(tree.root.subtrees[i].subtrees[k]) == TreeNode:
       print(tree.root.subtrees[i].subtrees[k].attribute)
     else:
          print(tree.root.subtrees[i].subtrees[k].labels)
          print("\n")

  else:
     print(tree.root.subtrees[i].labels)
     print("\n")

"""

#print( np.where(["Temperature" == features[i]  for i in  range(len(features)) ])[0][0] )



# Fit a tree with the default criterion, then measure how many of the
# training instances themselves it classifies correctly.
dt = DecisionTree(dataset, labels, features)
dt.train()

correct = 0
wrong = 0
for data_point, data_label in zip(dataset, labels):
    predicted = dt.predict(data_point)
    if predicted != data_label:
        wrong += 1
    else:
        correct += 1


print("Accuracy : %.2f" % (correct/(correct+wrong)*100))

#print(tree.root.subtrees[1].subtrees[0].labels)

['overcast' 'rain' 'sunny']
Outlook
['false' 'true']
Windy
['high' 'normal']
Humidity
Training completed
Accuracy : 100.00
